From 39d2fdb51233ed9b1aaf3adaa3267853f5e58c0f Mon Sep 17 00:00:00 2001 From: frreiss Date: Tue, 1 Nov 2016 23:00:17 -0700 Subject: [PATCH 001/534] [SPARK-17475][STREAMING] Delete CRC files if the filesystem doesn't use checksum files ## What changes were proposed in this pull request? When the metadata logs for various parts of Structured Streaming are stored on non-HDFS filesystems such as NFS or ext4, the HDFSMetadataLog class leaves hidden HDFS-style checksum (CRC) files in the log directory, one file per batch. This PR modifies HDFSMetadataLog so that it detects the use of a filesystem that doesn't use CRC files and removes the CRC files. ## How was this patch tested? Modified an existing test case in HDFSMetadataLogSuite to check whether HDFSMetadataLog correctly removes CRC files on the local POSIX filesystem. Ran the entire regression suite. Author: frreiss Closes #15027 from frreiss/fred-17475. (cherry picked from commit 620da3b4828b3580c7ed7339b2a07938e6be1bb1) Signed-off-by: Reynold Xin --- .../spark/sql/execution/streaming/HDFSMetadataLog.scala | 5 +++++ .../sql/execution/streaming/HDFSMetadataLogSuite.scala | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index c7235320fd6bd..9a0f87cf0498c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -148,6 +148,11 @@ class HDFSMetadataLog[T: ClassTag](sparkSession: SparkSession, path: String) // It will fail if there is an existing file (someone has committed the batch) logDebug(s"Attempting to write log #${batchIdToPath(batchId)}") fileManager.rename(tempPath, batchIdToPath(batchId)) + + // SPARK-17475: HDFSMetadataLog should not leak CRC files + // If the underlying filesystem didn't rename the CRC file, delete it. + val crcPath = new Path(tempPath.getParent(), s".${tempPath.getName()}.crc") + if (fileManager.exists(crcPath)) fileManager.delete(crcPath) return } catch { case e: IOException if isFileAlreadyExistsException(e) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala index 9c1d26dcb2241..d03e08d9a576c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLogSuite.scala @@ -119,6 +119,12 @@ class HDFSMetadataLogSuite extends SparkFunSuite with SharedSQLContext { assert(metadataLog.get(1).isEmpty) assert(metadataLog.get(2).isDefined) assert(metadataLog.getLatest().get._1 == 2) + + // There should be exactly one file, called "2", in the metadata directory. + // This check also tests for regressions of SPARK-17475 + val allFiles = new File(metadataLog.metadataPath.toString).listFiles().toSeq + assert(allFiles.size == 1) + assert(allFiles(0).getName() == "2") } } From e6509c2459e7ece3c3c6bcd143b8cc71f8f4d5c8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 2 Nov 2016 14:15:10 +0800 Subject: [PATCH 002/534] [SPARK-18183][SPARK-18184] Fix INSERT [INTO|OVERWRITE] TABLE ... PARTITION for Datasource tables There are a couple issues with the current 2.1 behavior when inserting into Datasource tables with partitions managed by Hive. 
(1) OVERWRITE TABLE ... PARTITION will actually overwrite the entire table instead of just the specified partition. (2) INSERT|OVERWRITE does not work with partitions that have custom locations. This PR fixes both of these issues for Datasource tables managed by Hive. The behavior for legacy tables or when `manageFilesourcePartitions = false` is unchanged. There is one other issue in that INSERT OVERWRITE with dynamic partitions will overwrite the entire table instead of just the updated partitions, but this behavior is pretty complicated to implement for Datasource tables. We should address that in a future release. Unit tests. Author: Eric Liang Closes #15705 from ericl/sc-4942. (cherry picked from commit abefe2ec428dc24a4112c623fb6fbe4b2ca60a2b) Signed-off-by: Reynold Xin --- .../spark/sql/catalyst/dsl/package.scala | 2 +- .../sql/catalyst/parser/AstBuilder.scala | 9 +++- .../plans/logical/basicLogicalOperators.scala | 19 ++++++- .../sql/catalyst/parser/PlanParserSuite.scala | 15 ++++-- .../apache/spark/sql/DataFrameWriter.scala | 4 +- .../datasources/CatalogFileIndex.scala | 5 +- .../datasources/DataSourceStrategy.scala | 30 +++++++++-- .../InsertIntoDataSourceCommand.scala | 6 +-- .../spark/sql/hive/HiveStrategies.scala | 3 +- .../CreateHiveTableAsSelectCommand.scala | 5 +- .../PartitionProviderCompatibilitySuite.scala | 52 +++++++++++++++++++ 11 files changed, 129 insertions(+), 21 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 66e52ca68af19..e901683be6854 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -367,7 +367,7 @@ package object dsl { def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan = InsertIntoTable( analysis.UnresolvedRelation(TableIdentifier(tableName)), - Map.empty, logicalPlan, overwrite, false) + Map.empty, logicalPlan, OverwriteOptions(overwrite), false) def as(alias: String): LogicalPlan = logicalPlan match { case UnresolvedRelation(tbl, _) => UnresolvedRelation(tbl, Option(alias)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 38e9bb6c162ad..ac1577b3abb4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -177,12 +177,19 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. 
Specified " + "partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) } + val overwrite = ctx.OVERWRITE != null + val overwritePartition = + if (overwrite && partitionKeys.nonEmpty && dynamicPartitionKeys.isEmpty) { + Some(partitionKeys.map(t => (t._1, t._2.get))) + } else { + None + } InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, query, - ctx.OVERWRITE != null, + OverwriteOptions(overwrite, overwritePartition), ctx.EXISTS != null) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index a48974c6322ad..7a15c2285d584 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -21,6 +21,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation +import org.apache.spark.sql.catalyst.catalog.CatalogTypes import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ @@ -345,18 +346,32 @@ case class BroadcastHint(child: LogicalPlan) extends UnaryNode { override lazy val statistics: Statistics = super.statistics.copy(isBroadcastable = true) } +/** + * Options for writing new data into a table. + * + * @param enabled whether to overwrite existing data in the table. + * @param specificPartition only data in the specified partition will be overwritten. + */ +case class OverwriteOptions( + enabled: Boolean, + specificPartition: Option[CatalogTypes.TablePartitionSpec] = None) { + if (specificPartition.isDefined) { + assert(enabled, "Overwrite must be enabled when specifying a partition to overwrite.") + } +} + case class InsertIntoTable( table: LogicalPlan, partition: Map[String, Option[String]], child: LogicalPlan, - overwrite: Boolean, + overwrite: OverwriteOptions, ifNotExists: Boolean) extends LogicalPlan { override def children: Seq[LogicalPlan] = child :: Nil override def output: Seq[Attribute] = Seq.empty - assert(overwrite || !ifNotExists) + assert(overwrite.enabled || !ifNotExists) assert(partition.values.forall(_.nonEmpty) || !ifNotExists) override lazy val resolved: Boolean = childrenResolved && table.resolved diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index ca86304d4d400..7400f3430e99c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -180,7 +180,16 @@ class PlanParserSuite extends PlanTest { partition: Map[String, Option[String]], overwrite: Boolean = false, ifNotExists: Boolean = false): LogicalPlan = - InsertIntoTable(table("s"), partition, plan, overwrite, ifNotExists) + InsertIntoTable( + table("s"), partition, plan, + OverwriteOptions( + overwrite, + if (overwrite && partition.nonEmpty) { + Some(partition.map(kv => (kv._1, kv._2.get))) + } else { + None + }), + ifNotExists) // Single inserts assertEqual(s"insert overwrite table s $sql", @@ -196,9 +205,9 @@ class PlanParserSuite extends PlanTest { val 
plan2 = table("t").where('x > 5).select(star()) assertEqual("from t insert into s select * limit 1 insert into u select * where x > 5", InsertIntoTable( - table("s"), Map.empty, plan.limit(1), overwrite = false, ifNotExists = false).union( + table("s"), Map.empty, plan.limit(1), OverwriteOptions(false), ifNotExists = false).union( InsertIntoTable( - table("u"), Map.empty, plan2, overwrite = false, ifNotExists = false))) + table("u"), Map.empty, plan2, OverwriteOptions(false), ifNotExists = false))) } test ("insert with if not exists") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 11dd1df909938..700f4835ac89a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -25,7 +25,7 @@ import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Union} +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, OverwriteOptions, Union} import org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, CreateTable, DataSource, HadoopFsRelation} import org.apache.spark.sql.types.StructType @@ -259,7 +259,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { table = UnresolvedRelation(tableIdent), partition = Map.empty[String, Option[String]], child = df.logicalPlan, - overwrite = mode == SaveMode.Overwrite, + overwrite = OverwriteOptions(mode == SaveMode.Overwrite), ifNotExists = false)).toRdd } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala index 092aabc89a36c..443a2ec033a98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala @@ -67,7 +67,10 @@ class CatalogFileIndex( val selectedPartitions = sparkSession.sessionState.catalog.listPartitionsByFilter( table.identifier, filters) val partitions = selectedPartitions.map { p => - PartitionPath(p.toRow(partitionSchema), p.storage.locationUri.get) + val path = new Path(p.storage.locationUri.get) + val fs = path.getFileSystem(hadoopConf) + PartitionPath( + p.toRow(partitionSchema), path.makeQualified(fs.getUri, fs.getWorkingDirectory)) } val partitionSpec = PartitionSpec(partitionSchema, partitions) new PrunedInMemoryFileIndex( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 34b77cab65def..47c1f9d3fac1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable.ArrayBuffer +import org.apache.hadoop.fs.Path + import org.apache.spark.internal.Logging import 
org.apache.spark.rdd.RDD import org.apache.spark.sql._ @@ -174,14 +176,32 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.rootPaths }.flatten - val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append - if (overwrite && inputPaths.contains(outputPath)) { + val mode = if (overwrite.enabled) SaveMode.Overwrite else SaveMode.Append + if (overwrite.enabled && inputPaths.contains(outputPath)) { throw new AnalysisException( "Cannot overwrite a path that is also being read from.") } + val overwritingSinglePartition = (overwrite.specificPartition.isDefined && + t.sparkSession.sessionState.conf.manageFilesourcePartitions && + l.catalogTable.get.partitionProviderIsHive) + + val effectiveOutputPath = if (overwritingSinglePartition) { + val partition = t.sparkSession.sessionState.catalog.getPartition( + l.catalogTable.get.identifier, overwrite.specificPartition.get) + new Path(partition.storage.locationUri.get) + } else { + outputPath + } + + val effectivePartitionSchema = if (overwritingSinglePartition) { + Nil + } else { + query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver) + } + def refreshPartitionsCallback(updatedPartitions: Seq[TablePartitionSpec]): Unit = { - if (l.catalogTable.isDefined && + if (l.catalogTable.isDefined && updatedPartitions.nonEmpty && l.catalogTable.get.partitionColumnNames.nonEmpty && l.catalogTable.get.partitionProviderIsHive) { val metastoreUpdater = AlterTableAddPartitionCommand( @@ -194,8 +214,8 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { } val insertCmd = InsertIntoHadoopFsRelationCommand( - outputPath, - query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver), + effectiveOutputPath, + effectivePartitionSchema, t.bucketSpec, t.fileFormat, refreshPartitionsCallback, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala index b2ff68a833fea..2eba1e9986acd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoDataSourceCommand.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OverwriteOptions} import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.sources.InsertableRelation @@ -30,7 +30,7 @@ import org.apache.spark.sql.sources.InsertableRelation case class InsertIntoDataSourceCommand( logicalRelation: LogicalRelation, query: LogicalPlan, - overwrite: Boolean) + overwrite: OverwriteOptions) extends RunnableCommand { override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) @@ -40,7 +40,7 @@ case class InsertIntoDataSourceCommand( val data = Dataset.ofRows(sparkSession, query) // Apply the schema of the existing table to the new data. val df = sparkSession.internalCreateDataFrame(data.queryExecution.toRdd, logicalRelation.schema) - relation.insert(df, overwrite) + relation.insert(df, overwrite.enabled) // Invalidate the cache. 
sparkSession.sharedState.cacheManager.invalidateCache(logicalRelation) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 9d2930948d6ba..ce1e3eb1a5bc9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -46,7 +46,8 @@ private[hive] trait HiveStrategies { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case logical.InsertIntoTable( table: MetastoreRelation, partition, child, overwrite, ifNotExists) => - InsertIntoHiveTable(table, partition, planLater(child), overwrite, ifNotExists) :: Nil + InsertIntoHiveTable( + table, partition, planLater(child), overwrite.enabled, ifNotExists) :: Nil case CreateTable(tableDesc, mode, Some(query)) if tableDesc.provider.get == "hive" => val newTableDesc = if (tableDesc.storage.serde.isEmpty) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala index ef5a5a001fb6f..cac43597aef21 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateHiveTableAsSelectCommand.scala @@ -21,7 +21,7 @@ import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions} import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.hive.MetastoreRelation @@ -88,7 +88,8 @@ case class CreateHiveTableAsSelectCommand( } else { try { sparkSession.sessionState.executePlan(InsertIntoTable( - metastoreRelation, Map(), query, overwrite = true, ifNotExists = false)).toRdd + metastoreRelation, Map(), query, overwrite = OverwriteOptions(true), + ifNotExists = false)).toRdd } catch { case NonFatal(e) => // drop the created table. 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala index 5f16960fb1496..ac435bf6195b0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -134,4 +134,56 @@ class PartitionProviderCompatibilitySuite } } } + + test("insert overwrite partition of legacy datasource table overwrites entire table") { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { + withTable("test") { + withTempDir { dir => + setupPartitionedDatasourceTable("test", dir) + spark.sql( + """insert overwrite table test + |partition (partCol=1) + |select * from range(100)""".stripMargin) + assert(spark.sql("select * from test").count() == 100) + + // Dynamic partitions case + spark.sql("insert overwrite table test select id, id from range(10)".stripMargin) + assert(spark.sql("select * from test").count() == 10) + } + } + } + } + + test("insert overwrite partition of new datasource table overwrites just partition") { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + withTable("test") { + withTempDir { dir => + setupPartitionedDatasourceTable("test", dir) + sql("msck repair table test") + spark.sql( + """insert overwrite table test + |partition (partCol=1) + |select * from range(100)""".stripMargin) + assert(spark.sql("select * from test").count() == 104) + + // Test overwriting a partition that has a custom location + withTempDir { dir2 => + sql( + s"""alter table test partition (partCol=1) + |set location '${dir2.getAbsolutePath}'""".stripMargin) + assert(sql("select * from test").count() == 4) + sql( + """insert overwrite table test + |partition (partCol=1) + |select * from range(30)""".stripMargin) + sql( + """insert overwrite table test + |partition (partCol=1) + |select * from range(20)""".stripMargin) + assert(sql("select * from test").count() == 24) + } + } + } + } + } } From 85dd073743946383438aabb9f1281e6075f25cc5 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 1 Nov 2016 23:37:03 -0700 Subject: [PATCH 003/534] [SPARK-18192] Support all file formats in structured streaming ## What changes were proposed in this pull request? This patch adds support for all file formats in structured streaming sinks. This is actually a very small change thanks to all the previous refactoring done using the new internal commit protocol API. ## How was this patch tested? Updated FileStreamSinkSuite to add test cases for json, text, and parquet. Author: Reynold Xin Closes #15711 from rxin/SPARK-18192. 
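As an illustration of what this enables (not code from the patch itself — the paths and app name below are placeholders), a streaming query can now target any `FileFormat`-based sink rather than only Parquet:

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: stream text files in and write them back out as JSON.
// Before this change, only "parquet" was accepted as a file-based streaming sink format.
val spark = SparkSession.builder().appName("file-sink-formats-sketch").getOrCreate()

val query = spark.readStream
  .format("text")
  .load("/tmp/streaming/input")
  .writeStream
  .format("json")
  .option("checkpointLocation", "/tmp/streaming/checkpoint")
  .start("/tmp/streaming/output")

query.awaitTermination()
```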
(cherry picked from commit a36653c5b7b2719f8bfddf4ddfc6e1b828ac9af1) Signed-off-by: Reynold Xin --- .../execution/datasources/DataSource.scala | 8 +-- .../sql/streaming/FileStreamSinkSuite.scala | 62 +++++++++---------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index d980e6a15aabe..3f956c427655e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -29,7 +29,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -37,7 +36,6 @@ import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} @@ -292,7 +290,7 @@ case class DataSource( case s: StreamSinkProvider => s.createSink(sparkSession.sqlContext, options, partitionColumns, outputMode) - case parquet: parquet.ParquetFileFormat => + case fileFormat: FileFormat => val caseInsensitiveOptions = new CaseInsensitiveMap(options) val path = caseInsensitiveOptions.getOrElse("path", { throw new IllegalArgumentException("'path' is not specified") @@ -301,7 +299,7 @@ case class DataSource( throw new IllegalArgumentException( s"Data source $className does not support $outputMode output mode") } - new FileStreamSink(sparkSession, path, parquet, partitionColumns, options) + new FileStreamSink(sparkSession, path, fileFormat, partitionColumns, options) case _ => throw new UnsupportedOperationException( @@ -516,7 +514,7 @@ case class DataSource( val plan = data.logicalPlan plan.resolve(name :: Nil, data.sparkSession.sessionState.analyzer.resolver).getOrElse { throw new AnalysisException( - s"Unable to resolve ${name} given [${plan.output.map(_.name).mkString(", ")}]") + s"Unable to resolve $name given [${plan.output.map(_.name).mkString(", ")}]") }.asInstanceOf[Attribute] } // For partitioned relation r, r.schema's column ordering can be different from the column diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 902cf05344716..0f140f94f630e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.streaming -import org.apache.spark.sql._ +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.DataSourceScanExec import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.streaming.{MemoryStream, MetadataLogFileIndex} @@ 
-142,42 +142,38 @@ class FileStreamSinkSuite extends StreamTest { } } - test("FileStreamSink - supported formats") { - def testFormat(format: Option[String]): Unit = { - val inputData = MemoryStream[Int] - val ds = inputData.toDS() + test("FileStreamSink - parquet") { + testFormat(None) // should not throw error as default format parquet when not specified + testFormat(Some("parquet")) + } - val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath - val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + test("FileStreamSink - text") { + testFormat(Some("text")) + } - var query: StreamingQuery = null + test("FileStreamSink - json") { + testFormat(Some("text")) + } - try { - val writer = - ds.map(i => (i, i * 1000)) - .toDF("id", "value") - .writeStream - if (format.nonEmpty) { - writer.format(format.get) - } - query = writer - .option("checkpointLocation", checkpointDir) - .start(outputDir) - } finally { - if (query != null) { - query.stop() - } - } - } + def testFormat(format: Option[String]): Unit = { + val inputData = MemoryStream[Int] + val ds = inputData.toDS() - testFormat(None) // should not throw error as default format parquet when not specified - testFormat(Some("parquet")) - val e = intercept[UnsupportedOperationException] { - testFormat(Some("text")) - } - Seq("text", "not support", "stream").foreach { s => - assert(e.getMessage.contains(s)) + val outputDir = Utils.createTempDir(namePrefix = "stream.output").getCanonicalPath + val checkpointDir = Utils.createTempDir(namePrefix = "stream.checkpoint").getCanonicalPath + + var query: StreamingQuery = null + + try { + val writer = ds.map(i => (i, i * 1000)).toDF("id", "value").writeStream + if (format.nonEmpty) { + writer.format(format.get) + } + query = writer.option("checkpointLocation", checkpointDir).start(outputDir) + } finally { + if (query != null) { + query.stop() + } } } - } From 4c4bf87acf2516a72b59f4e760413f80640dca1e Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 1 Nov 2016 23:39:53 -0700 Subject: [PATCH 004/534] [SPARK-18144][SQL] logging StreamingQueryListener$QueryStartedEvent ## What changes were proposed in this pull request? The PR fixes the bug that the QueryStartedEvent is not logged the postToAll() in the original code is actually calling StreamingQueryListenerBus.postToAll() which has no listener at all....we shall post by sparkListenerBus.postToAll(s) and this.postToAll() to trigger local listeners as well as the listeners registered in LiveListenerBus zsxwing ## How was this patch tested? The following snapshot shows that QueryStartedEvent has been logged correctly ![image](https://cloud.githubusercontent.com/assets/678008/19821553/007a7d28-9d2d-11e6-9f13-49851559cdaa.png) Author: CodingCat Closes #15675 from CodingCat/SPARK-18144. 
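As an illustration of the user-visible effect (the listener class below is hypothetical and mirrors the regression test added in this patch), a registered `StreamingQueryListener` should now receive `onQueryStarted` exactly once per started query:

```scala
import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

// Hypothetical listener that counts onQueryStarted callbacks; with the fix, the
// count should match the number of queries started on this session.
class CountingListener extends StreamingQueryListener {
  val started = new AtomicInteger(0)
  override def onQueryStarted(event: QueryStartedEvent): Unit = started.incrementAndGet()
  override def onQueryProgress(event: QueryProgressEvent): Unit = ()
  override def onQueryTerminated(event: QueryTerminatedEvent): Unit = ()
}

val spark = SparkSession.builder().appName("listener-sketch").getOrCreate()
spark.streams.addListener(new CountingListener)
```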
(cherry picked from commit 85c5424d466f4a5765c825e0e2ab30da97611285) Signed-off-by: Shixiong Zhu --- .../streaming/StreamingQueryListenerBus.scala | 10 +++++++++- .../spark/sql/streaming/StreamingQuerySuite.scala | 7 ++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala index fc2190d39da4f..22e4c6380fcd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala @@ -41,6 +41,8 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) def post(event: StreamingQueryListener.Event) { event match { case s: QueryStartedEvent => + sparkListenerBus.post(s) + // post to local listeners to trigger callbacks postToAll(s) case _ => sparkListenerBus.post(event) @@ -50,7 +52,13 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) override def onOtherEvent(event: SparkListenerEvent): Unit = { event match { case e: StreamingQueryListener.Event => - postToAll(e) + // SPARK-18144: we broadcast QueryStartedEvent to all listeners attached to this bus + // synchronously and the ones attached to LiveListenerBus asynchronously. Therefore, + // we need to ignore QueryStartedEvent if this method is called within SparkListenerBus + // thread + if (!LiveListenerBus.withinListenerThread.value || !e.isInstanceOf[QueryStartedEvent]) { + postToAll(e) + } case _ => } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 464c443beb6e7..31b7fe0b04da9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -290,7 +290,10 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { // A StreamingQueryListener that gets the query status after the first completed trigger val listener = new StreamingQueryListener { @volatile var firstStatus: StreamingQueryStatus = null - override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { } + @volatile var queryStartedEvent = 0 + override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = { + queryStartedEvent += 1 + } override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = { if (firstStatus == null) firstStatus = queryProgress.queryStatus } @@ -303,6 +306,8 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging { q.processAllAvailable() eventually(timeout(streamingTimeout)) { assert(listener.firstStatus != null) + // test if QueryStartedEvent callback is called for only once + assert(listener.queryStartedEvent === 1) } listener.firstStatus } finally { From 3b624bedf0f0ecd5dcfcc262a3ca8b4e33662533 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 2 Nov 2016 00:08:30 -0700 Subject: [PATCH 005/534] [SPARK-17532] Add lock debugging info to thread dumps. ## What changes were proposed in this pull request? This adds information to the web UI thread dump page about the JVM locks held by threads and the locks that threads are blocked waiting to acquire. This should help find cases where lock contention is causing Spark applications to run slowly. 
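For reference, the lock information comes from the standard `java.lang.management` API; here is a standalone sketch of the underlying calls (simplified relative to the actual change, which threads this data through `ThreadStackTrace` into the executors page):

```scala
import java.lang.management.ManagementFactory

// Dump all threads with monitor and synchronizer details, then print what each
// thread is blocked on and which locks it currently holds.
val threadInfos = ManagementFactory.getThreadMXBean
  .dumpAllThreads(true, true) // lockedMonitors = true, lockedSynchronizers = true
  .filter(_ != null)

threadInfos.sortBy(_.getThreadId).foreach { info =>
  val heldLocks = (info.getLockedMonitors.map(_.toString) ++
    info.getLockedSynchronizers.map(_.toString)).toSet
  val blockedOn = Option(info.getLockInfo).map(_.toString).getOrElse("<none>")
  println(s"${info.getThreadId} ${info.getThreadName} [${info.getThreadState}] " +
    s"blocked on: $blockedOn, holding: ${heldLocks.mkString(", ")}")
}
```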
## How was this patch tested? Tested by applying this patch and viewing the change in the web UI. ![thread-lock-info](https://cloud.githubusercontent.com/assets/87915/18493057/6e5da870-79c3-11e6-8c20-f54c18a37544.png) Additions: - A "Thread Locking" column with the locks held by the thread or that are blocking the thread - Links from the a blocked thread to the thread holding the lock - Stack frames show where threads are inside `synchronized` blocks, "holding Monitor(...)" Author: Ryan Blue Closes #15088 from rdblue/SPARK-17532-add-thread-lock-info. (cherry picked from commit 2dc048081668665f85623839d5f663b402e42555) Signed-off-by: Reynold Xin --- .../org/apache/spark/ui/static/table.js | 3 +- .../ui/exec/ExecutorThreadDumpPage.scala | 12 +++++++ .../apache/spark/util/ThreadStackTrace.scala | 6 +++- .../scala/org/apache/spark/util/Utils.scala | 34 ++++++++++++++++--- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/table.js b/core/src/main/resources/org/apache/spark/ui/static/table.js index 14b06bfe860ed..0315ebf5c48a9 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/table.js +++ b/core/src/main/resources/org/apache/spark/ui/static/table.js @@ -36,7 +36,7 @@ function toggleThreadStackTrace(threadId, forceAdd) { if (stackTrace.length == 0) { var stackTraceText = $('#' + threadId + "_td_stacktrace").html() var threadCell = $("#thread_" + threadId + "_tr") - threadCell.after("
" +
+        threadCell.after("
" +
             stackTraceText +  "
") } else { if (!forceAdd) { @@ -73,6 +73,7 @@ function onMouseOverAndOut(threadId) { $("#" + threadId + "_td_id").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_name").toggleClass("threaddump-td-mouseover"); $("#" + threadId + "_td_state").toggleClass("threaddump-td-mouseover"); + $("#" + threadId + "_td_locking").toggleClass("threaddump-td-mouseover"); } function onSearchStringChange() { diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index a0ef80d9bdae0..c6a07445f2a35 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -48,6 +48,16 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage } }.map { thread => val threadId = thread.threadId + val blockedBy = thread.blockedByThreadId match { + case Some(blockedByThreadId) => + + case None => Text("") + } + val heldLocks = thread.holdingLocks.mkString(", ") + {threadId} {thread.threadName} {thread.threadState} + {blockedBy}{heldLocks} {thread.stackTrace} } @@ -86,6 +97,7 @@ private[ui] class ExecutorThreadDumpPage(parent: ExecutorsTab) extends WebUIPage Thread ID Thread Name Thread State + Thread Locks {dumpRows} diff --git a/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala b/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala index d4e0ad93b966a..b1217980faf1f 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadStackTrace.scala @@ -24,4 +24,8 @@ private[spark] case class ThreadStackTrace( threadId: Long, threadName: String, threadState: Thread.State, - stackTrace: String) + stackTrace: String, + blockedByThreadId: Option[Long], + blockedByLock: String, + holdingLocks: Seq[String]) + diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 6027b07c0fee8..22c28fba2087e 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -18,7 +18,7 @@ package org.apache.spark.util import java.io._ -import java.lang.management.ManagementFactory +import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo} import java.net._ import java.nio.ByteBuffer import java.nio.channels.Channels @@ -2096,15 +2096,41 @@ private[spark] object Utils extends Logging { } } + private implicit class Lock(lock: LockInfo) { + def lockString: String = { + lock match { + case monitor: MonitorInfo => + s"Monitor(${lock.getClassName}@${lock.getIdentityHashCode}})" + case _ => + s"Lock(${lock.getClassName}@${lock.getIdentityHashCode}})" + } + } + } + /** Return a thread dump of all threads' stacktraces. Used to capture dumps for the web UI */ def getThreadDump(): Array[ThreadStackTrace] = { // We need to filter out null values here because dumpAllThreads() may return null array // elements for threads that are dead / don't exist. 
val threadInfos = ManagementFactory.getThreadMXBean.dumpAllThreads(true, true).filter(_ != null) threadInfos.sortBy(_.getThreadId).map { case threadInfo => - val stackTrace = threadInfo.getStackTrace.map(_.toString).mkString("\n") - ThreadStackTrace(threadInfo.getThreadId, threadInfo.getThreadName, - threadInfo.getThreadState, stackTrace) + val monitors = threadInfo.getLockedMonitors.map(m => m.getLockedStackFrame -> m).toMap + val stackTrace = threadInfo.getStackTrace.map { frame => + monitors.get(frame) match { + case Some(monitor) => + monitor.getLockedStackFrame.toString + s" => holding ${monitor.lockString}" + case None => + frame.toString + } + }.mkString("\n") + + // use a set to dedup re-entrant locks that are held at multiple places + val heldLocks = (threadInfo.getLockedSynchronizers.map(_.lockString) + ++ threadInfo.getLockedMonitors.map(_.lockString) + ).toSet + + ThreadStackTrace(threadInfo.getThreadId, threadInfo.getThreadName, threadInfo.getThreadState, + stackTrace, if (threadInfo.getLockOwnerId < 0) None else Some(threadInfo.getLockOwnerId), + Option(threadInfo.getLockInfo).map(_.lockString).getOrElse(""), heldLocks.toSeq) } } From ab8da1413836591fecbc75a2515875bf3e50527f Mon Sep 17 00:00:00 2001 From: Liwei Lin Date: Wed, 2 Nov 2016 09:10:34 +0000 Subject: [PATCH 006/534] [SPARK-18198][DOC][STREAMING] Highlight code snippets ## What changes were proposed in this pull request? This patch uses `{% highlight lang %}...{% endhighlight %}` to highlight code snippets in the `Structured Streaming Kafka010 integration doc` and the `Spark Streaming Kafka010 integration doc`. This patch consists of two commits: - the first commit fixes only the leading spaces -- this is large - the second commit adds the highlight instructions -- this is much simpler and easier to review ## How was this patch tested? SKIP_API=1 jekyll build ## Screenshots **Before** ![snip20161101_3](https://cloud.githubusercontent.com/assets/15843379/19894258/47746524-a087-11e6-9a2a-7bff2d428d44.png) **After** ![snip20161101_1](https://cloud.githubusercontent.com/assets/15843379/19894324/8bebcd1e-a087-11e6-835b-88c4d2979cfa.png) Author: Liwei Lin Closes #15715 from lw-lin/doc-highlight-code-snippet. (cherry picked from commit 98ede49496d0d7b4724085083d4f24436b92a7bf) Signed-off-by: Sean Owen --- docs/streaming-kafka-0-10-integration.md | 391 +++++++++--------- .../structured-streaming-kafka-integration.md | 156 +++---- 2 files changed, 287 insertions(+), 260 deletions(-) diff --git a/docs/streaming-kafka-0-10-integration.md b/docs/streaming-kafka-0-10-integration.md index c1ef396907db7..b645d3c3a4b53 100644 --- a/docs/streaming-kafka-0-10-integration.md +++ b/docs/streaming-kafka-0-10-integration.md @@ -17,69 +17,72 @@ For Scala/Java applications using SBT/Maven project definitions, link your strea
- import org.apache.kafka.clients.consumer.ConsumerRecord - import org.apache.kafka.common.serialization.StringDeserializer - import org.apache.spark.streaming.kafka010._ - import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent - import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe - - val kafkaParams = Map[String, Object]( - "bootstrap.servers" -> "localhost:9092,anotherhost:9092", - "key.deserializer" -> classOf[StringDeserializer], - "value.deserializer" -> classOf[StringDeserializer], - "group.id" -> "use_a_separate_group_id_for_each_stream", - "auto.offset.reset" -> "latest", - "enable.auto.commit" -> (false: java.lang.Boolean) - ) - - val topics = Array("topicA", "topicB") - val stream = KafkaUtils.createDirectStream[String, String]( - streamingContext, - PreferConsistent, - Subscribe[String, String](topics, kafkaParams) - ) - - stream.map(record => (record.key, record.value)) - +{% highlight scala %} +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.kafka.common.serialization.StringDeserializer +import org.apache.spark.streaming.kafka010._ +import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent +import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe + +val kafkaParams = Map[String, Object]( + "bootstrap.servers" -> "localhost:9092,anotherhost:9092", + "key.deserializer" -> classOf[StringDeserializer], + "value.deserializer" -> classOf[StringDeserializer], + "group.id" -> "use_a_separate_group_id_for_each_stream", + "auto.offset.reset" -> "latest", + "enable.auto.commit" -> (false: java.lang.Boolean) +) + +val topics = Array("topicA", "topicB") +val stream = KafkaUtils.createDirectStream[String, String]( + streamingContext, + PreferConsistent, + Subscribe[String, String](topics, kafkaParams) +) + +stream.map(record => (record.key, record.value)) +{% endhighlight %} Each item in the stream is a [ConsumerRecord](http://kafka.apache.org/0100/javadoc/org/apache/kafka/clients/consumer/ConsumerRecord.html)
- import java.util.*; - import org.apache.spark.SparkConf; - import org.apache.spark.TaskContext; - import org.apache.spark.api.java.*; - import org.apache.spark.api.java.function.*; - import org.apache.spark.streaming.api.java.*; - import org.apache.spark.streaming.kafka010.*; - import org.apache.kafka.clients.consumer.ConsumerRecord; - import org.apache.kafka.common.TopicPartition; - import org.apache.kafka.common.serialization.StringDeserializer; - import scala.Tuple2; - - Map kafkaParams = new HashMap<>(); - kafkaParams.put("bootstrap.servers", "localhost:9092,anotherhost:9092"); - kafkaParams.put("key.deserializer", StringDeserializer.class); - kafkaParams.put("value.deserializer", StringDeserializer.class); - kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream"); - kafkaParams.put("auto.offset.reset", "latest"); - kafkaParams.put("enable.auto.commit", false); - - Collection topics = Arrays.asList("topicA", "topicB"); - - final JavaInputDStream> stream = - KafkaUtils.createDirectStream( - streamingContext, - LocationStrategies.PreferConsistent(), - ConsumerStrategies.Subscribe(topics, kafkaParams) - ); - - stream.mapToPair( - new PairFunction, String, String>() { - @Override - public Tuple2 call(ConsumerRecord record) { - return new Tuple2<>(record.key(), record.value()); - } - }) +{% highlight java %} +import java.util.*; +import org.apache.spark.SparkConf; +import org.apache.spark.TaskContext; +import org.apache.spark.api.java.*; +import org.apache.spark.api.java.function.*; +import org.apache.spark.streaming.api.java.*; +import org.apache.spark.streaming.kafka010.*; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.StringDeserializer; +import scala.Tuple2; + +Map kafkaParams = new HashMap<>(); +kafkaParams.put("bootstrap.servers", "localhost:9092,anotherhost:9092"); +kafkaParams.put("key.deserializer", StringDeserializer.class); +kafkaParams.put("value.deserializer", StringDeserializer.class); +kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream"); +kafkaParams.put("auto.offset.reset", "latest"); +kafkaParams.put("enable.auto.commit", false); + +Collection topics = Arrays.asList("topicA", "topicB"); + +final JavaInputDStream> stream = + KafkaUtils.createDirectStream( + streamingContext, + LocationStrategies.PreferConsistent(), + ConsumerStrategies.Subscribe(topics, kafkaParams) + ); + +stream.mapToPair( + new PairFunction, String, String>() { + @Override + public Tuple2 call(ConsumerRecord record) { + return new Tuple2<>(record.key(), record.value()); + } + }) +{% endhighlight %}
@@ -109,32 +112,35 @@ If you have a use case that is better suited to batch processing, you can create
- // Import dependencies and create kafka params as in Create Direct Stream above - - val offsetRanges = Array( - // topic, partition, inclusive starting offset, exclusive ending offset - OffsetRange("test", 0, 0, 100), - OffsetRange("test", 1, 0, 100) - ) +{% highlight scala %} +// Import dependencies and create kafka params as in Create Direct Stream above - val rdd = KafkaUtils.createRDD[String, String](sparkContext, kafkaParams, offsetRanges, PreferConsistent) +val offsetRanges = Array( + // topic, partition, inclusive starting offset, exclusive ending offset + OffsetRange("test", 0, 0, 100), + OffsetRange("test", 1, 0, 100) +) +val rdd = KafkaUtils.createRDD[String, String](sparkContext, kafkaParams, offsetRanges, PreferConsistent) +{% endhighlight %}
- // Import dependencies and create kafka params as in Create Direct Stream above - - OffsetRange[] offsetRanges = { - // topic, partition, inclusive starting offset, exclusive ending offset - OffsetRange.create("test", 0, 0, 100), - OffsetRange.create("test", 1, 0, 100) - }; - - JavaRDD> rdd = KafkaUtils.createRDD( - sparkContext, - kafkaParams, - offsetRanges, - LocationStrategies.PreferConsistent() - ); +{% highlight java %} +// Import dependencies and create kafka params as in Create Direct Stream above + +OffsetRange[] offsetRanges = { + // topic, partition, inclusive starting offset, exclusive ending offset + OffsetRange.create("test", 0, 0, 100), + OffsetRange.create("test", 1, 0, 100) +}; + +JavaRDD> rdd = KafkaUtils.createRDD( + sparkContext, + kafkaParams, + offsetRanges, + LocationStrategies.PreferConsistent() +); +{% endhighlight %}
@@ -144,29 +150,33 @@ Note that you cannot use `PreferBrokers`, because without the stream there is no
- stream.foreachRDD { rdd => - val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - rdd.foreachPartition { iter => - val o: OffsetRange = offsetRanges(TaskContext.get.partitionId) - println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}") - } - } +{% highlight scala %} +stream.foreachRDD { rdd => + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + rdd.foreachPartition { iter => + val o: OffsetRange = offsetRanges(TaskContext.get.partitionId) + println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}") + } +} +{% endhighlight %}
- stream.foreachRDD(new VoidFunction>>() { - @Override - public void call(JavaRDD> rdd) { - final OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); - rdd.foreachPartition(new VoidFunction>>() { - @Override - public void call(Iterator> consumerRecords) { - OffsetRange o = offsetRanges[TaskContext.get().partitionId()]; - System.out.println( - o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()); - } - }); - } - }); +{% highlight java %} +stream.foreachRDD(new VoidFunction>>() { + @Override + public void call(JavaRDD> rdd) { + final OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + rdd.foreachPartition(new VoidFunction>>() { + @Override + public void call(Iterator> consumerRecords) { + OffsetRange o = offsetRanges[TaskContext.get().partitionId()]; + System.out.println( + o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset()); + } + }); + } +}); +{% endhighlight %}
@@ -183,25 +193,28 @@ Kafka has an offset commit API that stores offsets in a special Kafka topic. By
- stream.foreachRDD { rdd => - val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - - // some time later, after outputs have completed - stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) - } - +{% highlight scala %} +stream.foreachRDD { rdd => + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + + // some time later, after outputs have completed + stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) +} +{% endhighlight %} As with HasOffsetRanges, the cast to CanCommitOffsets will only succeed if called on the result of createDirectStream, not after transformations. The commitAsync call is threadsafe, but must occur after outputs if you want meaningful semantics.
- stream.foreachRDD(new VoidFunction>>() { - @Override - public void call(JavaRDD> rdd) { - OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); - - // some time later, after outputs have completed - ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges); - } - }); +{% highlight java %} +stream.foreachRDD(new VoidFunction>>() { + @Override + public void call(JavaRDD> rdd) { + OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + + // some time later, after outputs have completed + ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges); + } +}); +{% endhighlight %}
@@ -210,64 +223,68 @@ For data stores that support transactions, saving offsets in the same transactio
- // The details depend on your data store, but the general idea looks like this +{% highlight scala %} +// The details depend on your data store, but the general idea looks like this - // begin from the the offsets committed to the database - val fromOffsets = selectOffsetsFromYourDatabase.map { resultSet => - new TopicPartition(resultSet.string("topic"), resultSet.int("partition")) -> resultSet.long("offset") - }.toMap +// begin from the the offsets committed to the database +val fromOffsets = selectOffsetsFromYourDatabase.map { resultSet => + new TopicPartition(resultSet.string("topic"), resultSet.int("partition")) -> resultSet.long("offset") +}.toMap - val stream = KafkaUtils.createDirectStream[String, String]( - streamingContext, - PreferConsistent, - Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets) - ) +val stream = KafkaUtils.createDirectStream[String, String]( + streamingContext, + PreferConsistent, + Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets) +) - stream.foreachRDD { rdd => - val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges +stream.foreachRDD { rdd => + val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - val results = yourCalculation(rdd) + val results = yourCalculation(rdd) - // begin your transaction + // begin your transaction - // update results - // update offsets where the end of existing offsets matches the beginning of this batch of offsets - // assert that offsets were updated correctly + // update results + // update offsets where the end of existing offsets matches the beginning of this batch of offsets + // assert that offsets were updated correctly - // end your transaction - } + // end your transaction +} +{% endhighlight %}
- // The details depend on your data store, but the general idea looks like this - - // begin from the the offsets committed to the database - Map fromOffsets = new HashMap<>(); - for (resultSet : selectOffsetsFromYourDatabase) - fromOffsets.put(new TopicPartition(resultSet.string("topic"), resultSet.int("partition")), resultSet.long("offset")); - } - - JavaInputDStream> stream = KafkaUtils.createDirectStream( - streamingContext, - LocationStrategies.PreferConsistent(), - ConsumerStrategies.Assign(fromOffsets.keySet(), kafkaParams, fromOffsets) - ); - - stream.foreachRDD(new VoidFunction>>() { - @Override - public void call(JavaRDD> rdd) { - OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); - - Object results = yourCalculation(rdd); - - // begin your transaction - - // update results - // update offsets where the end of existing offsets matches the beginning of this batch of offsets - // assert that offsets were updated correctly - - // end your transaction - } - }); +{% highlight java %} +// The details depend on your data store, but the general idea looks like this + +// begin from the the offsets committed to the database +Map fromOffsets = new HashMap<>(); +for (resultSet : selectOffsetsFromYourDatabase) + fromOffsets.put(new TopicPartition(resultSet.string("topic"), resultSet.int("partition")), resultSet.long("offset")); +} + +JavaInputDStream> stream = KafkaUtils.createDirectStream( + streamingContext, + LocationStrategies.PreferConsistent(), + ConsumerStrategies.Assign(fromOffsets.keySet(), kafkaParams, fromOffsets) +); + +stream.foreachRDD(new VoidFunction>>() { + @Override + public void call(JavaRDD> rdd) { + OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + + Object results = yourCalculation(rdd); + + // begin your transaction + + // update results + // update offsets where the end of existing offsets matches the beginning of this batch of offsets + // assert that offsets were updated correctly + + // end your transaction + } +}); +{% endhighlight %}
@@ -277,25 +294,29 @@ The new Kafka consumer [supports SSL](http://kafka.apache.org/documentation.html
- val kafkaParams = Map[String, Object]( - // the usual params, make sure to change the port in bootstrap.servers if 9092 is not TLS - "security.protocol" -> "SSL", - "ssl.truststore.location" -> "/some-directory/kafka.client.truststore.jks", - "ssl.truststore.password" -> "test1234", - "ssl.keystore.location" -> "/some-directory/kafka.client.keystore.jks", - "ssl.keystore.password" -> "test1234", - "ssl.key.password" -> "test1234" - ) +{% highlight scala %} +val kafkaParams = Map[String, Object]( + // the usual params, make sure to change the port in bootstrap.servers if 9092 is not TLS + "security.protocol" -> "SSL", + "ssl.truststore.location" -> "/some-directory/kafka.client.truststore.jks", + "ssl.truststore.password" -> "test1234", + "ssl.keystore.location" -> "/some-directory/kafka.client.keystore.jks", + "ssl.keystore.password" -> "test1234", + "ssl.key.password" -> "test1234" +) +{% endhighlight %}
- Map kafkaParams = new HashMap(); - // the usual params, make sure to change the port in bootstrap.servers if 9092 is not TLS - kafkaParams.put("security.protocol", "SSL"); - kafkaParams.put("ssl.truststore.location", "/some-directory/kafka.client.truststore.jks"); - kafkaParams.put("ssl.truststore.password", "test1234"); - kafkaParams.put("ssl.keystore.location", "/some-directory/kafka.client.keystore.jks"); - kafkaParams.put("ssl.keystore.password", "test1234"); - kafkaParams.put("ssl.key.password", "test1234"); +{% highlight java %} +Map kafkaParams = new HashMap(); +// the usual params, make sure to change the port in bootstrap.servers if 9092 is not TLS +kafkaParams.put("security.protocol", "SSL"); +kafkaParams.put("ssl.truststore.location", "/some-directory/kafka.client.truststore.jks"); +kafkaParams.put("ssl.truststore.password", "test1234"); +kafkaParams.put("ssl.keystore.location", "/some-directory/kafka.client.keystore.jks"); +kafkaParams.put("ssl.keystore.password", "test1234"); +kafkaParams.put("ssl.key.password", "test1234"); +{% endhighlight %}
diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md index a6c3b3a9024d8..c4c9fb3f7d3db 100644 --- a/docs/structured-streaming-kafka-integration.md +++ b/docs/structured-streaming-kafka-integration.md @@ -19,97 +19,103 @@ application. See the [Deploying](#deploying) subsection below.
+{% highlight scala %} - // Subscribe to 1 topic - val ds1 = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", "host1:port1,host2:port2") - .option("subscribe", "topic1") - .load() - ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] +// Subscribe to 1 topic +val ds1 = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1") + .load() +ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] - // Subscribe to multiple topics - val ds2 = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", "host1:port1,host2:port2") - .option("subscribe", "topic1,topic2") - .load() - ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] +// Subscribe to multiple topics +val ds2 = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribe", "topic1,topic2") + .load() +ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] - // Subscribe to a pattern - val ds3 = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", "host1:port1,host2:port2") - .option("subscribePattern", "topic.*") - .load() - ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] +// Subscribe to a pattern +val ds3 = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "host1:port1,host2:port2") + .option("subscribePattern", "topic.*") + .load() +ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] +{% endhighlight %}
+{% highlight java %}
- // Subscribe to 1 topic
- Dataset<Row> ds1 = spark
- .readStream()
- .format("kafka")
- .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
- .option("subscribe", "topic1")
- .load()
- ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+// Subscribe to 1 topic
+Dataset<Row> ds1 = spark
+ .readStream()
+ .format("kafka")
+ .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+ .option("subscribe", "topic1")
+ .load()
+ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
- // Subscribe to multiple topics
- Dataset<Row> ds2 = spark
- .readStream()
- .format("kafka")
- .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
- .option("subscribe", "topic1,topic2")
- .load()
- ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+// Subscribe to multiple topics
+Dataset<Row> ds2 = spark
+ .readStream()
+ .format("kafka")
+ .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+ .option("subscribe", "topic1,topic2")
+ .load()
+ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
- // Subscribe to a pattern
- Dataset<Row> ds3 = spark
- .readStream()
- .format("kafka")
- .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
- .option("subscribePattern", "topic.*")
- .load()
- ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+// Subscribe to a pattern
+Dataset<Row> ds3 = spark
+ .readStream()
+ .format("kafka")
+ .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+ .option("subscribePattern", "topic.*")
+ .load()
+ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+{% endhighlight %}
+{% highlight python %}
- # Subscribe to 1 topic
- ds1 = spark
- .readStream()
- .format("kafka")
- .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
- .option("subscribe", "topic1")
- .load()
- ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+# Subscribe to 1 topic
+ds1 = spark
+ .readStream()
+ .format("kafka")
+ .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+ .option("subscribe", "topic1")
+ .load()
+ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
- # Subscribe to multiple topics
- ds2 = spark
- .readStream
- .format("kafka")
- .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
- .option("subscribe", "topic1,topic2")
- .load()
- ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+# Subscribe to multiple topics
+ds2 = spark
+ .readStream
+ .format("kafka")
+ .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+ .option("subscribe", "topic1,topic2")
+ .load()
+ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
- # Subscribe to a pattern
- ds3 = spark
- .readStream()
- .format("kafka")
- .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
- .option("subscribePattern", "topic.*")
- .load()
- ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+# Subscribe to a pattern
+ds3 = spark
+ .readStream()
+ .format("kafka")
+ .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
+ .option("subscribePattern", "topic.*")
+ .load()
+ds3.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
+{% endhighlight %}
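For context, the streaming Datasets created above only start consuming once a sink is attached and the query is started. A minimal sketch against the console sink is shown below; it reuses `ds1` from the Scala example, assumes an existing `SparkSession` named `spark`, and the output mode and checkpoint path are illustrative choices rather than anything this documentation change prescribes.

{% highlight scala %}
import spark.implicits._  // for the (String, String) encoder; assumes a SparkSession named `spark`

val query = ds1
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .as[(String, String)]
  .writeStream
  .outputMode("append")                                      // emit each new Kafka record once
  .format("console")                                         // print micro-batches to stdout
  .option("checkpointLocation", "/tmp/kafka-console-ckpt")   // hypothetical checkpoint directory
  .start()

query.awaitTermination()
{% endhighlight %}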
From 176afa5e8b207e28a16e1b22280ed05c10b7b486 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 2 Nov 2016 09:39:15 +0000 Subject: [PATCH 007/534] [SPARK-18076][CORE][SQL] Fix default Locale used in DateFormat, NumberFormat to Locale.US ## What changes were proposed in this pull request? Fix `Locale.US` for all usages of `DateFormat`, `NumberFormat` ## How was this patch tested? Existing tests. Author: Sean Owen Closes #15610 from srowen/SPARK-18076. (cherry picked from commit 9c8deef64efee20a0ddc9b612f90e77c80aede60) Signed-off-by: Sean Owen --- .../org/apache/spark/SparkHadoopWriter.scala | 8 +++---- .../apache/spark/deploy/SparkHadoopUtil.scala | 4 ++-- .../apache/spark/deploy/master/Master.scala | 5 ++-- .../apache/spark/deploy/worker/Worker.scala | 4 ++-- .../org/apache/spark/rdd/HadoopRDD.scala | 5 ++-- .../org/apache/spark/rdd/NewHadoopRDD.scala | 4 ++-- .../apache/spark/rdd/PairRDDFunctions.scala | 4 ++-- .../status/api/v1/JacksonMessageWriter.scala | 4 ++-- .../spark/status/api/v1/SimpleDateParam.scala | 6 ++--- .../scala/org/apache/spark/ui/UIUtils.scala | 3 ++- .../spark/util/logging/RollingPolicy.scala | 6 ++--- .../org/apache/spark/util/UtilsSuite.scala | 2 +- .../deploy/rest/mesos/MesosRestServer.scala | 11 ++++----- .../mllib/pmml/export/PMMLModelExport.scala | 4 ++-- .../expressions/datetimeExpressions.scala | 17 ++++++------- .../expressions/stringExpressions.scala | 2 +- .../spark/sql/catalyst/json/JSONOptions.scala | 6 +++-- .../sql/catalyst/util/DateTimeUtils.scala | 6 ++--- .../expressions/DateExpressionsSuite.scala | 24 +++++++++---------- .../catalyst/util/DateTimeUtilsSuite.scala | 6 ++--- .../datasources/csv/CSVInferSchema.scala | 4 ++-- .../datasources/csv/CSVOptions.scala | 5 ++-- .../sql/execution/metric/SQLMetrics.scala | 2 +- .../sql/execution/streaming/socket.scala | 4 ++-- .../apache/spark/sql/DateFunctionsSuite.scala | 11 +++++---- .../execution/datasources/csv/CSVSuite.scala | 9 +++---- .../datasources/csv/CSVTypeCastSuite.scala | 9 ++++--- .../hive/execution/InsertIntoHiveTable.scala | 9 +++---- .../spark/sql/hive/hiveWriterContainers.scala | 4 ++-- .../sql/sources/SimpleTextRelation.scala | 3 ++- .../apache/spark/streaming/ui/UIUtils.scala | 8 ++++--- 31 files changed, 103 insertions(+), 96 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index 6550d703bc860..7f75a393bf8ff 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -20,7 +20,7 @@ package org.apache.spark import java.io.IOException import java.text.NumberFormat import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path @@ -67,12 +67,12 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { def setup(jobid: Int, splitid: Int, attemptid: Int) { setIDs(jobid, splitid, attemptid) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(now), + HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(now), jobid, splitID, attemptID, conf.value) } def open() { - val numfmt = NumberFormat.getInstance() + val numfmt = NumberFormat.getInstance(Locale.US) numfmt.setMinimumIntegerDigits(5) numfmt.setGroupingUsed(false) @@ -162,7 +162,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { private[spark] 
object SparkHadoopWriter { def createJobID(time: Date, id: Int): JobID = { - val formatter = new SimpleDateFormat("yyyyMMddHHmmss") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) val jobtrackerID = formatter.format(time) new JobID(jobtrackerID, id) } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 3f54ecc17ac33..23156072c3ebe 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -21,7 +21,7 @@ import java.io.IOException import java.lang.reflect.Method import java.security.PrivilegedExceptionAction import java.text.DateFormat -import java.util.{Arrays, Comparator, Date} +import java.util.{Arrays, Comparator, Date, Locale} import scala.collection.JavaConverters._ import scala.util.control.NonFatal @@ -357,7 +357,7 @@ class SparkHadoopUtil extends Logging { * @return a printable string value. */ private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = { - val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT) + val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US) val buffer = new StringBuilder(128) buffer.append(token.toString) try { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 8c91aa15167c4..4618e6117a4fb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.master import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} @@ -51,7 +51,8 @@ private[deploy] class Master( private val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) - private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs + // For application IDs + private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) private val WORKER_TIMEOUT_MS = conf.getLong("spark.worker.timeout", 60) * 1000 private val RETAINED_APPLICATIONS = conf.getInt("spark.deploy.retainedApplications", 200) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 0bedd9a20a969..8b1c6bf2e5fd5 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -20,7 +20,7 @@ package org.apache.spark.deploy.worker import java.io.File import java.io.IOException import java.text.SimpleDateFormat -import java.util.{Date, UUID} +import java.util.{Date, Locale, UUID} import java.util.concurrent._ import java.util.concurrent.{Future => JFuture, ScheduledFuture => JScheduledFuture} @@ -68,7 +68,7 @@ private[deploy] class Worker( ThreadUtils.newDaemonSingleThreadExecutor("worker-cleanup-thread")) // For worker and executor IDs - private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") + private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) // Send a heartbeat every (heartbeat timeout) / 4 milliseconds private val HEARTBEAT_MILLIS = conf.getLong("spark.worker.timeout", 60) * 1000 / 4 diff --git 
a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index e1cf3938de098..36a2f5c87e372 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.io.IOException import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import scala.collection.immutable.Map import scala.reflect.ClassTag @@ -243,7 +243,8 @@ class HadoopRDD[K, V]( var reader: RecordReader[K, V] = null val inputFormat = getInputFormat(jobConf) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("yyyyMMddHHmmss").format(createTime), + HadoopRDD.addLocalConfiguration( + new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(createTime), context.stageId, theSplit.index, context.attemptNumber, jobConf) reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL) diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index baf31fb658870..488e777fea371 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.io.IOException import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import scala.reflect.ClassTag @@ -79,7 +79,7 @@ class NewHadoopRDD[K, V]( // private val serializableConf = new SerializableWritable(_conf) private val jobTrackerId: String = { - val formatter = new SimpleDateFormat("yyyyMMddHHmmss") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) formatter.format(new Date()) } diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 068f4ed8ad745..67baad1c51bca 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -19,7 +19,7 @@ package org.apache.spark.rdd import java.nio.ByteBuffer import java.text.SimpleDateFormat -import java.util.{Date, HashMap => JHashMap} +import java.util.{Date, HashMap => JHashMap, Locale} import scala.collection.{mutable, Map} import scala.collection.JavaConverters._ @@ -1079,7 +1079,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038). 
val hadoopConf = conf val job = NewAPIHadoopJob.getInstance(hadoopConf) - val formatter = new SimpleDateFormat("yyyyMMddHHmmss") + val formatter = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) val jobtrackerID = formatter.format(new Date()) val stageId = self.id val jobConfiguration = job.getConfiguration diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala index f6a9f9c5573db..76af33c1a18db 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/JacksonMessageWriter.scala @@ -21,7 +21,7 @@ import java.lang.annotation.Annotation import java.lang.reflect.Type import java.nio.charset.StandardCharsets import java.text.SimpleDateFormat -import java.util.{Calendar, SimpleTimeZone} +import java.util.{Calendar, Locale, SimpleTimeZone} import javax.ws.rs.Produces import javax.ws.rs.core.{MediaType, MultivaluedMap} import javax.ws.rs.ext.{MessageBodyWriter, Provider} @@ -86,7 +86,7 @@ private[v1] class JacksonMessageWriter extends MessageBodyWriter[Object]{ private[spark] object JacksonMessageWriter { def makeISODateFormat: SimpleDateFormat = { - val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'") + val iso8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'GMT'", Locale.US) val cal = Calendar.getInstance(new SimpleTimeZone(0, "GMT")) iso8601.setCalendar(cal) iso8601 diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala index 0c71cd2382225..d8d5e8958b23c 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/SimpleDateParam.scala @@ -17,7 +17,7 @@ package org.apache.spark.status.api.v1 import java.text.{ParseException, SimpleDateFormat} -import java.util.TimeZone +import java.util.{Locale, TimeZone} import javax.ws.rs.WebApplicationException import javax.ws.rs.core.Response import javax.ws.rs.core.Response.Status @@ -25,12 +25,12 @@ import javax.ws.rs.core.Response.Status private[v1] class SimpleDateParam(val originalValue: String) { val timestamp: Long = { - val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz") + val format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz", Locale.US) try { format.parse(originalValue).getTime() } catch { case _: ParseException => - val gmtDay = new SimpleDateFormat("yyyy-MM-dd") + val gmtDay = new SimpleDateFormat("yyyy-MM-dd", Locale.US) gmtDay.setTimeZone(TimeZone.getTimeZone("GMT")) try { gmtDay.parse(originalValue).getTime() diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index c0d1a2220f62a..66b097aa8166d 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -36,7 +36,8 @@ private[spark] object UIUtils extends Logging { // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use. 
private val dateFormat = new ThreadLocal[SimpleDateFormat]() { - override def initialValue(): SimpleDateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss") + override def initialValue(): SimpleDateFormat = + new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US) } def formatDate(date: Date): String = dateFormat.get.format(date) diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala index 5c4238c0381a1..1f263df57c857 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala @@ -18,7 +18,7 @@ package org.apache.spark.util.logging import java.text.SimpleDateFormat -import java.util.Calendar +import java.util.{Calendar, Locale} import org.apache.spark.internal.Logging @@ -59,7 +59,7 @@ private[spark] class TimeBasedRollingPolicy( } @volatile private var nextRolloverTime = calculateNextRolloverTime() - private val formatter = new SimpleDateFormat(rollingFileSuffixPattern) + private val formatter = new SimpleDateFormat(rollingFileSuffixPattern, Locale.US) /** Should rollover if current time has exceeded next rollover time */ def shouldRollover(bytesToBeWritten: Long): Boolean = { @@ -109,7 +109,7 @@ private[spark] class SizeBasedRollingPolicy( } @volatile private var bytesWrittenSinceRollover = 0L - val formatter = new SimpleDateFormat("--yyyy-MM-dd--HH-mm-ss--SSSS") + val formatter = new SimpleDateFormat("--yyyy-MM-dd--HH-mm-ss--SSSS", Locale.US) /** Should rollover if the next set of bytes is going to exceed the size limit */ def shouldRollover(bytesToBeWritten: Long): Boolean = { diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 15ef32f21d90c..feacfb7642f27 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -264,7 +264,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { val hour = minute * 60 def str: (Long) => String = Utils.msDurationToString(_) - val sep = new DecimalFormatSymbols(Locale.getDefault()).getDecimalSeparator() + val sep = new DecimalFormatSymbols(Locale.US).getDecimalSeparator assert(str(123) === "123 ms") assert(str(second) === "1" + sep + "0 s") diff --git a/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala b/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala index 3b96488a129a9..ff60b88c6d533 100644 --- a/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala +++ b/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.rest.mesos import java.io.File import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import java.util.concurrent.atomic.AtomicLong import javax.servlet.http.HttpServletResponse @@ -62,11 +62,10 @@ private[mesos] class MesosSubmitRequestServlet( private val DEFAULT_CORES = 1.0 private val nextDriverNumber = new AtomicLong(0) - private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs - private def newDriverId(submitDate: Date): String = { - "driver-%s-%04d".format( - createDateFormat.format(submitDate), nextDriverNumber.incrementAndGet()) - } + // For application IDs + private def createDateFormat = new 
SimpleDateFormat("yyyyMMddHHmmss", Locale.US) + private def newDriverId(submitDate: Date): String = + f"driver-${createDateFormat.format(submitDate)}-${nextDriverNumber.incrementAndGet()}%04d" /** * Build a driver description from the fields specified in the submit request. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala index 426bb818c9266..f5ca1c221d66b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExport.scala @@ -18,7 +18,7 @@ package org.apache.spark.mllib.pmml.export import java.text.SimpleDateFormat -import java.util.Date +import java.util.{Date, Locale} import scala.beans.BeanProperty @@ -34,7 +34,7 @@ private[mllib] trait PMMLModelExport { val version = getClass.getPackage.getImplementationVersion val app = new Application("Apache Spark MLlib").setVersion(version) val timestamp = new Timestamp() - .addContent(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(new Date())) + .addContent(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US).format(new Date())) val header = new Header() .setApplication(app) .setTimestamp(timestamp) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 7ab68a13e09cf..67c078ae5e264 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import java.text.SimpleDateFormat -import java.util.{Calendar, TimeZone} +import java.util.{Calendar, Locale, TimeZone} import scala.util.Try @@ -331,7 +331,7 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, StringType) override protected def nullSafeEval(timestamp: Any, format: Any): Any = { - val sdf = new SimpleDateFormat(format.toString) + val sdf = new SimpleDateFormat(format.toString, Locale.US) UTF8String.fromString(sdf.format(new java.util.Date(timestamp.asInstanceOf[Long] / 1000))) } @@ -400,7 +400,7 @@ abstract class UnixTime extends BinaryExpression with ExpectsInputTypes { private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String] private lazy val formatter: SimpleDateFormat = - Try(new SimpleDateFormat(constFormat.toString)).getOrElse(null) + Try(new SimpleDateFormat(constFormat.toString, Locale.US)).getOrElse(null) override def eval(input: InternalRow): Any = { val t = left.eval(input) @@ -425,7 +425,7 @@ abstract class UnixTime extends BinaryExpression with ExpectsInputTypes { null } else { val formatString = f.asInstanceOf[UTF8String].toString - Try(new SimpleDateFormat(formatString).parse( + Try(new SimpleDateFormat(formatString, Locale.US).parse( t.asInstanceOf[UTF8String].toString).getTime / 1000L).getOrElse(null) } } @@ -520,7 +520,7 @@ case class FromUnixTime(sec: Expression, format: Expression) private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String] private lazy val formatter: SimpleDateFormat = - Try(new SimpleDateFormat(constFormat.toString)).getOrElse(null) + Try(new SimpleDateFormat(constFormat.toString, Locale.US)).getOrElse(null) 
override def eval(input: InternalRow): Any = { val time = left.eval(input) @@ -539,9 +539,10 @@ case class FromUnixTime(sec: Expression, format: Expression) if (f == null) { null } else { - Try(UTF8String.fromString(new SimpleDateFormat( - f.asInstanceOf[UTF8String].toString).format(new java.util.Date( - time.asInstanceOf[Long] * 1000L)))).getOrElse(null) + Try( + UTF8String.fromString(new SimpleDateFormat(f.toString, Locale.US). + format(new java.util.Date(time.asInstanceOf[Long] * 1000L))) + ).getOrElse(null) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 1bcbb6cfc9246..25a5e3fd7da73 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1415,7 +1415,7 @@ case class Sentences( val locale = if (languageStr != null && countryStr != null) { new Locale(languageStr.toString, countryStr.toString) } else { - Locale.getDefault + Locale.US } getSentences(string.asInstanceOf[UTF8String].toString, locale) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index aec18922ea6c8..c45970658cf07 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.json +import java.util.Locale + import com.fasterxml.jackson.core.{JsonFactory, JsonParser} import org.apache.commons.lang3.time.FastDateFormat @@ -56,11 +58,11 @@ private[sql] class JSONOptions( // Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe. val dateFormat: FastDateFormat = - FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd")) + FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), Locale.US) val timestampFormat: FastDateFormat = FastDateFormat.getInstance( - parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ")) + parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ"), Locale.US) // Parse mode flags if (!ParseModes.isValidMode(parseMode)) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 0b643a5b84268..235ca8d2633a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.{DateFormat, SimpleDateFormat} -import java.util.{Calendar, TimeZone} +import java.util.{Calendar, Locale, TimeZone} import javax.xml.bind.DatatypeConverter import scala.annotation.tailrec @@ -79,14 +79,14 @@ object DateTimeUtils { // `SimpleDateFormat` is not thread-safe. val threadLocalTimestampFormat = new ThreadLocal[DateFormat] { override def initialValue(): SimpleDateFormat = { - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) } } // `SimpleDateFormat` is not thread-safe. 
private val threadLocalDateFormat = new ThreadLocal[DateFormat] { override def initialValue(): SimpleDateFormat = { - new SimpleDateFormat("yyyy-MM-dd") + new SimpleDateFormat("yyyy-MM-dd", Locale.US) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 6118a34d29eaa..35cea25ba0b7d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat -import java.util.Calendar +import java.util.{Calendar, Locale} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -30,8 +30,8 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { import IntegralLiteralTestUtils._ - val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") - val sdfDate = new SimpleDateFormat("yyyy-MM-dd") + val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val sdfDate = new SimpleDateFormat("yyyy-MM-dd", Locale.US) val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime) val ts = new Timestamp(sdf.parse("2013-11-08 13:10:15").getTime) @@ -49,7 +49,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("DayOfYear") { - val sdfDay = new SimpleDateFormat("D") + val sdfDay = new SimpleDateFormat("D", Locale.US) (0 to 3).foreach { m => (0 to 5).foreach { i => val c = Calendar.getInstance() @@ -411,9 +411,9 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("from_unixtime") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2) + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) checkEvaluation( FromUnixTime(Literal(0L), Literal("yyyy-MM-dd HH:mm:ss")), sdf1.format(new Timestamp(0))) checkEvaluation(FromUnixTime( @@ -430,11 +430,11 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("unix_timestamp") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2) + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) val fmt3 = "yy-MM-dd" - val sdf3 = new SimpleDateFormat(fmt3) + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) val date1 = Date.valueOf("2015-07-24") checkEvaluation( UnixTimestamp(Literal(sdf1.format(new Timestamp(0))), Literal("yyyy-MM-dd HH:mm:ss")), 0L) @@ -466,11 +466,11 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("to_unix_timestamp") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2) + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) val fmt3 = "yy-MM-dd" - val sdf3 = new SimpleDateFormat(fmt3) + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) val date1 = Date.valueOf("2015-07-24") checkEvaluation( ToUnixTimestamp(Literal(sdf1.format(new Timestamp(0))), Literal("yyyy-MM-dd HH:mm:ss")), 0L) diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index 4f516d006458e..e0a9a0c3d5c00 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat -import java.util.{Calendar, TimeZone} +import java.util.{Calendar, Locale, TimeZone} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.DateTimeUtils._ @@ -68,8 +68,8 @@ class DateTimeUtilsSuite extends SparkFunSuite { assert(d2.toString === d1.toString) } - val df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") - val df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z") + val df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z", Locale.US) checkFromToJavaDate(new Date(100)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index 3ab775c909238..1981d8607c0c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -247,7 +247,7 @@ private[csv] object CSVTypeCast { case options.positiveInf => Float.PositiveInfinity case _ => Try(datum.toFloat) - .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).floatValue()) + .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).floatValue()) } case _: DoubleType => datum match { @@ -256,7 +256,7 @@ private[csv] object CSVTypeCast { case options.positiveInf => Double.PositiveInfinity case _ => Try(datum.toDouble) - .getOrElse(NumberFormat.getInstance(Locale.getDefault).parse(datum).doubleValue()) + .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).doubleValue()) } case _: BooleanType => datum.toBoolean case dt: DecimalType => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 014614eb997a5..5903729c11fc5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.csv import java.nio.charset.StandardCharsets +import java.util.Locale import org.apache.commons.lang3.time.FastDateFormat @@ -104,11 +105,11 @@ private[csv] class CSVOptions(@transient private val parameters: Map[String, Str // Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe. 
val dateFormat: FastDateFormat = - FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd")) + FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), Locale.US) val timestampFormat: FastDateFormat = FastDateFormat.getInstance( - parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ")) + parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSZZ"), Locale.US) val maxColumns = getInt("maxColumns", 20480) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 0cc1edd196bc8..dbc27d8b237f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -102,7 +102,7 @@ object SQLMetrics { */ def stringValue(metricsType: String, values: Seq[Long]): String = { if (metricsType == SUM_METRIC) { - val numberFormat = NumberFormat.getIntegerInstance(Locale.ENGLISH) + val numberFormat = NumberFormat.getIntegerInstance(Locale.US) numberFormat.format(values.sum) } else { val strFormat: Long => String = if (metricsType == SIZE_METRIC) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala index c662e7c6bc775..042977f870b8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala @@ -21,7 +21,7 @@ import java.io.{BufferedReader, InputStreamReader, IOException} import java.net.Socket import java.sql.Timestamp import java.text.SimpleDateFormat -import java.util.Calendar +import java.util.{Calendar, Locale} import javax.annotation.concurrent.GuardedBy import scala.collection.mutable.ListBuffer @@ -37,7 +37,7 @@ object TextSocketSource { val SCHEMA_REGULAR = StructType(StructField("value", StringType) :: Nil) val SCHEMA_TIMESTAMP = StructType(StructField("value", StringType) :: StructField("timestamp", TimestampType) :: Nil) - val DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + val DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index f7aa3b747ae5d..e05b2252ee346 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat +import java.util.Locale import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.functions._ @@ -55,8 +56,8 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { checkAnswer(sql("""SELECT CURRENT_TIMESTAMP() = NOW()"""), Row(true)) } - val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") - val sdfDate = new SimpleDateFormat("yyyy-MM-dd") + val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val sdfDate = new SimpleDateFormat("yyyy-MM-dd", Locale.US) val d = new Date(sdf.parse("2015-04-08 13:10:15").getTime) val ts = new Timestamp(sdf.parse("2013-04-08 13:10:15").getTime) @@ -395,11 +396,11 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { } test("from_unixtime") { - val sdf1 = new 
SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2) + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) val fmt3 = "yy-MM-dd HH-mm-ss" - val sdf3 = new SimpleDateFormat(fmt3) + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") checkAnswer( df.select(from_unixtime(col("a"))), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index f7c22c6c93f7a..8209b5bd7f9de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -21,6 +21,7 @@ import java.io.File import java.nio.charset.UnsupportedCharsetException import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat +import java.util.Locale import org.apache.commons.lang3.time.FastDateFormat import org.apache.hadoop.io.SequenceFile.CompressionType @@ -487,7 +488,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .select("date") .collect() - val dateFormat = new SimpleDateFormat("dd/MM/yyyy HH:mm") + val dateFormat = new SimpleDateFormat("dd/MM/yyyy HH:mm", Locale.US) val expected = Seq(Seq(new Timestamp(dateFormat.parse("26/08/2015 18:00").getTime)), Seq(new Timestamp(dateFormat.parse("27/10/2014 18:30").getTime)), @@ -509,7 +510,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .select("date") .collect() - val dateFormat = new SimpleDateFormat("dd/MM/yyyy hh:mm") + val dateFormat = new SimpleDateFormat("dd/MM/yyyy hh:mm", Locale.US) val expected = Seq( new Date(dateFormat.parse("26/08/2015 18:00").getTime), new Date(dateFormat.parse("27/10/2014 18:30").getTime), @@ -728,7 +729,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .option("inferSchema", "false") .load(iso8601timestampsPath) - val iso8501 = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSZZ") + val iso8501 = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSZZ", Locale.US) val expectedTimestamps = timestamps.collect().map { r => // This should be ISO8601 formatted string. Row(iso8501.format(r.toSeq.head)) @@ -761,7 +762,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .option("inferSchema", "false") .load(iso8601datesPath) - val iso8501 = FastDateFormat.getInstance("yyyy-MM-dd") + val iso8501 = FastDateFormat.getInstance("yyyy-MM-dd", Locale.US) val expectedDates = dates.collect().map { r => // This should be ISO8601 formatted string. 
Row(iso8501.format(r.toSeq.head)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala index 51832a13cfe0b..c74406b9cbfbb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala @@ -144,13 +144,12 @@ class CSVTypeCastSuite extends SparkFunSuite { DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime)) } - test("Float and Double Types are cast correctly with Locale") { + test("Float and Double Types are cast without respect to platform default Locale") { val originalLocale = Locale.getDefault try { - val locale : Locale = new Locale("fr", "FR") - Locale.setDefault(locale) - assert(CSVTypeCast.castTo("1,00", FloatType) == 1.0) - assert(CSVTypeCast.castTo("1,00", DoubleType) == 1.0) + Locale.setDefault(new Locale("fr", "FR")) + assert(CSVTypeCast.castTo("1,00", FloatType) == 100.0) // Would parse as 1.0 in fr-FR + assert(CSVTypeCast.castTo("1,00", DoubleType) == 100.0) } finally { Locale.setDefault(originalLocale) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 2843100fb3b36..05164d774ccaf 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -20,9 +20,7 @@ package org.apache.spark.sql.hive.execution import java.io.IOException import java.net.URI import java.text.SimpleDateFormat -import java.util.{Date, Random} - -import scala.collection.JavaConverters._ +import java.util.{Date, Locale, Random} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} @@ -60,9 +58,8 @@ case class InsertIntoHiveTable( private def executionId: String = { val rand: Random = new Random - val format: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS") - val executionId: String = "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) - return executionId + val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) + "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) } private def getStagingDir(inputPath: Path, hadoopConf: Configuration): Path = { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala index ea88276bb96c0..e53c3e4d4833b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveWriterContainers.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive import java.text.NumberFormat -import java.util.Date +import java.util.{Date, Locale} import scala.collection.JavaConverters._ @@ -95,7 +95,7 @@ private[hive] class SparkHiveWriterContainer( } protected def getOutputName: String = { - val numberFormat = NumberFormat.getInstance() + val numberFormat = NumberFormat.getInstance(Locale.US) numberFormat.setMinimumIntegerDigits(5) numberFormat.setGroupingUsed(false) val extension = Utilities.getFileExtension(conf.value, fileSinkConf.getCompressed, outputFormat) diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index 64d0ecbeefc98..cecfd99098659 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.sources import java.text.NumberFormat +import java.util.Locale import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} @@ -141,7 +142,7 @@ class SimpleTextOutputWriter(path: String, context: TaskAttemptContext) class AppendingTextOutputFormat(path: String) extends TextOutputFormat[NullWritable, Text] { - val numberFormat = NumberFormat.getInstance() + val numberFormat = NumberFormat.getInstance(Locale.US) numberFormat.setMinimumIntegerDigits(5) numberFormat.setGroupingUsed(false) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala index 9b1c939e9329f..84ecf81abfbf1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala @@ -18,7 +18,7 @@ package org.apache.spark.streaming.ui import java.text.SimpleDateFormat -import java.util.TimeZone +import java.util.{Locale, TimeZone} import java.util.concurrent.TimeUnit import scala.xml.Node @@ -80,11 +80,13 @@ private[streaming] object UIUtils { // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use. private val batchTimeFormat = new ThreadLocal[SimpleDateFormat]() { - override def initialValue(): SimpleDateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss") + override def initialValue(): SimpleDateFormat = + new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US) } private val batchTimeFormatWithMilliseconds = new ThreadLocal[SimpleDateFormat]() { - override def initialValue(): SimpleDateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss.SSS") + override def initialValue(): SimpleDateFormat = + new SimpleDateFormat("yyyy/MM/dd HH:mm:ss.SSS", Locale.US) } /** From 41491e54080742f6e4a1e80a72cd9f46a9336e31 Mon Sep 17 00:00:00 2001 From: eyal farago Date: Wed, 2 Nov 2016 11:12:20 +0100 Subject: [PATCH 008/534] [SPARK-16839][SQL] Simplify Struct creation code path ## What changes were proposed in this pull request? Simplify struct creation, especially the aspect of `CleanupAliases` which missed some aliases when handling trees created by `CreateStruct`. This PR includes: 1. A failing test (create struct with nested aliases, some of the aliases survive `CleanupAliases`). 2. A fix that transforms `CreateStruct` into a `CreateNamedStruct` constructor, effectively eliminating `CreateStruct` from all expression trees. 3. A `NamePlaceHolder` used by `CreateStruct` when column names cannot be extracted from unresolved `NamedExpression`. 4. A new Analyzer rule that resolves `NamePlaceHolder` into a string literal once the `NamedExpression` is resolved. 5. `CleanupAliases` code was simplified as it no longer has to deal with `CreateStruct`'s top level columns. ## How was this patch tested? Running all tests-suits in package org.apache.spark.sql, especially including the analysis suite, making sure added test initially fails, after applying suggested fix rerun the entire analysis package successfully. Modified few tests that expected `CreateStruct` which is now transformed into `CreateNamedStruct`. 
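For context, the user-visible effect of lowering `struct` to `CreateNamedStruct` can be sketched as follows; this is only an illustration of the expected behaviour, assuming an existing `SparkSession` named `spark`, and the commented expectations are not output copied from the patch.

{% highlight scala %}
import org.apache.spark.sql.functions._
import spark.implicits._  // for the $"..." column syntax; assumes a SparkSession named `spark`

// Field names of the struct come from the aliases on its inputs; the point of the
// fix is that these top-level aliases survive analysis once struct() is lowered
// to CreateNamedStruct.
val df = spark.range(2).select(struct($"id".as("a"), lit("x").as("b")).as("s"))

// Expect a struct column `s` with fields named `a` and `b`.
df.printSchema()

// The analyzed plan should show CreateNamedStruct (name/value pairs) rather than CreateStruct.
println(df.queryExecution.analyzed.numberedTreeString)
{% endhighlight %}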
Author: eyal farago Author: Herman van Hovell Author: eyal farago Author: Eyal Farago Author: Hyukjin Kwon Author: eyalfa Closes #15718 from hvanhovell/SPARK-16839-2. (cherry picked from commit f151bd1af8a05d4b6c901ebe6ac0b51a4a1a20df) Signed-off-by: Herman van Hovell --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 12 +- .../sql/catalyst/analysis/Analyzer.scala | 53 ++--- .../catalyst/analysis/FunctionRegistry.scala | 2 +- .../sql/catalyst/expressions/Projection.scala | 2 - .../expressions/complexTypeCreator.scala | 212 ++++++------------ .../sql/catalyst/parser/AstBuilder.scala | 4 +- .../sql/catalyst/analysis/AnalysisSuite.scala | 38 +++- .../expressions/ComplexTypeSuite.scala | 1 - .../scala/org/apache/spark/sql/Column.scala | 3 + .../command/AnalyzeColumnCommand.scala | 4 +- .../sql-tests/results/group-by.sql.out | 2 +- .../apache/spark/sql/hive/test/TestHive.scala | 20 +- .../resources/sqlgen/subquery_in_having_2.sql | 2 +- .../sql/catalyst/LogicalPlanToSQLSuite.scala | 12 +- 14 files changed, 169 insertions(+), 198 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 806019d7524ff..d7fe6b32822a7 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1222,16 +1222,16 @@ test_that("column functions", { # Test struct() df <- createDataFrame(list(list(1L, 2L, 3L), list(4L, 5L, 6L)), schema = c("a", "b", "c")) - result <- collect(select(df, struct("a", "c"))) + result <- collect(select(df, alias(struct("a", "c"), "d"))) expected <- data.frame(row.names = 1:2) - expected$"struct(a, c)" <- list(listToStruct(list(a = 1L, c = 3L)), - listToStruct(list(a = 4L, c = 6L))) + expected$"d" <- list(listToStruct(list(a = 1L, c = 3L)), + listToStruct(list(a = 4L, c = 6L))) expect_equal(result, expected) - result <- collect(select(df, struct(df$a, df$b))) + result <- collect(select(df, alias(struct(df$a, df$b), "d"))) expected <- data.frame(row.names = 1:2) - expected$"struct(a, b)" <- list(listToStruct(list(a = 1L, b = 2L)), - listToStruct(list(a = 4L, b = 5L))) + expected$"d" <- list(listToStruct(list(a = 1L, b = 2L)), + listToStruct(list(a = 4L, b = 5L))) expect_equal(result, expected) # Test encode(), decode() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f8f4799322b3b..5011f2fdbf9b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.optimizer.BooleanSimplification import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, _} import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.catalyst.trees.{TreeNodeRef} +import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.sql.catalyst.util.toPrettySQL import org.apache.spark.sql.types._ @@ -83,6 +83,7 @@ class Analyzer( ResolveTableValuedFunctions :: ResolveRelations :: ResolveReferences :: + ResolveCreateNamedStruct :: ResolveDeserializer :: ResolveNewInstance :: ResolveUpCast :: @@ -653,11 +654,12 @@ class Analyzer( case s: Star => s.expand(child, resolver) case o => o :: Nil }) - case c: CreateStruct if containsStar(c.children) => - c.copy(children = c.children.flatMap { - case s: Star => s.expand(child, resolver) - case o => o 
:: Nil - }) + case c: CreateNamedStruct if containsStar(c.valExprs) => + val newChildren = c.children.grouped(2).flatMap { + case Seq(k, s : Star) => CreateStruct(s.expand(child, resolver)).children + case kv => kv + } + c.copy(children = newChildren.toList ) case c: CreateArray if containsStar(c.children) => c.copy(children = c.children.flatMap { case s: Star => s.expand(child, resolver) @@ -1141,7 +1143,7 @@ class Analyzer( case In(e, Seq(l @ ListQuery(_, exprId))) if e.resolved => // Get the left hand side expressions. val expressions = e match { - case CreateStruct(exprs) => exprs + case cns : CreateNamedStruct => cns.valExprs case expr => Seq(expr) } resolveSubQuery(l, plans, expressions.size) { (rewrite, conditions) => @@ -2072,18 +2074,8 @@ object EliminateUnions extends Rule[LogicalPlan] { */ object CleanupAliases extends Rule[LogicalPlan] { private def trimAliases(e: Expression): Expression = { - var stop = false e.transformDown { - // CreateStruct is a special case, we need to retain its top level Aliases as they decide the - // name of StructField. We also need to stop transform down this expression, or the Aliases - // under CreateStruct will be mistakenly trimmed. - case c: CreateStruct if !stop => - stop = true - c.copy(children = c.children.map(trimNonTopLevelAliases)) - case c: CreateStructUnsafe if !stop => - stop = true - c.copy(children = c.children.map(trimNonTopLevelAliases)) - case Alias(child, _) if !stop => child + case Alias(child, _) => child } } @@ -2116,15 +2108,8 @@ object CleanupAliases extends Rule[LogicalPlan] { case a: AppendColumns => a case other => - var stop = false other transformExpressionsDown { - case c: CreateStruct if !stop => - stop = true - c.copy(children = c.children.map(trimNonTopLevelAliases)) - case c: CreateStructUnsafe if !stop => - stop = true - c.copy(children = c.children.map(trimNonTopLevelAliases)) - case Alias(child, _) if !stop => child + case Alias(child, _) => child } } } @@ -2217,3 +2202,19 @@ object TimeWindowing extends Rule[LogicalPlan] { } } } + +/** + * Resolve a [[CreateNamedStruct]] if it contains [[NamePlaceholder]]s. 
+ */ +object ResolveCreateNamedStruct extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressions { + case e: CreateNamedStruct if !e.resolved => + val children = e.children.grouped(2).flatMap { + case Seq(NamePlaceholder, e: NamedExpression) if e.resolved => + Seq(Literal(e.name), e) + case kv => + kv + } + CreateNamedStruct(children.toList) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 3e836ca375e2e..b028d07fb8d0c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -357,7 +357,7 @@ object FunctionRegistry { expression[MapValues]("map_values"), expression[Size]("size"), expression[SortArray]("sort_array"), - expression[CreateStruct]("struct"), + CreateStruct.registryEntry, // misc functions expression[AssertTrue]("assert_true"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala index a81fa1ce3adcc..03e054d098511 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala @@ -119,7 +119,6 @@ object UnsafeProjection { */ def create(exprs: Seq[Expression]): UnsafeProjection = { val unsafeExprs = exprs.map(_ transform { - case CreateStruct(children) => CreateStructUnsafe(children) case CreateNamedStruct(children) => CreateNamedStructUnsafe(children) }) GenerateUnsafeProjection.generate(unsafeExprs) @@ -145,7 +144,6 @@ object UnsafeProjection { subexpressionEliminationEnabled: Boolean): UnsafeProjection = { val e = exprs.map(BindReferences.bindReference(_, inputSchema)) .map(_ transform { - case CreateStruct(children) => CreateStructUnsafe(children) case CreateNamedStruct(children) => CreateNamedStructUnsafe(children) }) GenerateUnsafeProjection.generate(e, subexpressionEliminationEnabled) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 917aa0873130b..dbfb2996ec9d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -18,9 +18,11 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, MapData, TypeUtils} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, TypeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -172,101 +174,71 @@ case class CreateMap(children: Seq[Expression]) extends Expression { } /** - * Returns a Row containing the evaluation of all children expressions. 
+ * An expression representing a not yet available attribute name. This expression is unevaluable + * and as its name suggests it is a temporary place holder until we're able to determine the + * actual attribute name. */ -@ExpressionDescription( - usage = "_FUNC_(col1, col2, col3, ...) - Creates a struct with the given field values.") -case class CreateStruct(children: Seq[Expression]) extends Expression { - - override def foldable: Boolean = children.forall(_.foldable) - - override lazy val dataType: StructType = { - val fields = children.zipWithIndex.map { case (child, idx) => - child match { - case ne: NamedExpression => - StructField(ne.name, ne.dataType, ne.nullable, ne.metadata) - case _ => - StructField(s"col${idx + 1}", child.dataType, child.nullable, Metadata.empty) - } - } - StructType(fields) - } - +case object NamePlaceholder extends LeafExpression with Unevaluable { + override lazy val resolved: Boolean = false + override def foldable: Boolean = false override def nullable: Boolean = false + override def dataType: DataType = StringType + override def prettyName: String = "NamePlaceholder" + override def toString: String = prettyName +} - override def eval(input: InternalRow): Any = { - InternalRow(children.map(_.eval(input)): _*) +/** + * Returns a Row containing the evaluation of all children expressions. + */ +object CreateStruct extends FunctionBuilder { + def apply(children: Seq[Expression]): CreateNamedStruct = { + CreateNamedStruct(children.zipWithIndex.flatMap { + case (e: NamedExpression, _) if e.resolved => Seq(Literal(e.name), e) + case (e: NamedExpression, _) => Seq(NamePlaceholder, e) + case (e, index) => Seq(Literal(s"col${index + 1}"), e) + }) } - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val rowClass = classOf[GenericInternalRow].getName - val values = ctx.freshName("values") - ctx.addMutableState("Object[]", values, s"this.$values = null;") - - ev.copy(code = s""" - boolean ${ev.isNull} = false; - this.$values = new Object[${children.size}];""" + - ctx.splitExpressions( - ctx.INPUT_ROW, - children.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - eval.code + s""" - if (${eval.isNull}) { - $values[$i] = null; - } else { - $values[$i] = ${eval.value}; - }""" - }) + - s""" - final InternalRow ${ev.value} = new $rowClass($values); - this.$values = null; - """) + /** + * Entry to use in the function registry. + */ + val registryEntry: (String, (ExpressionInfo, FunctionBuilder)) = { + val info: ExpressionInfo = new ExpressionInfo( + "org.apache.spark.sql.catalyst.expressions.NamedStruct", + null, + "struct", + "_FUNC_(col1, col2, col3, ...) - Creates a struct with the given field values.", + "") + ("struct", (info, this)) } - - override def prettyName: String = "struct" } - /** - * Creates a struct with the given field names and values - * - * @param children Seq(name1, val1, name2, val2, ...) + * Common base class for both [[CreateNamedStruct]] and [[CreateNamedStructUnsafe]]. */ -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(name1, val1, name2, val2, ...) 
- Creates a struct with the given field names and values.") -// scalastyle:on line.size.limit -case class CreateNamedStruct(children: Seq[Expression]) extends Expression { +trait CreateNamedStructLike extends Expression { + lazy val (nameExprs, valExprs) = children.grouped(2).map { + case Seq(name, value) => (name, value) + }.toList.unzip - /** - * Returns Aliased [[Expression]]s that could be used to construct a flattened version of this - * StructType. - */ - def flatten: Seq[NamedExpression] = valExprs.zip(names).map { - case (v, n) => Alias(v, n.toString)() - } + lazy val names = nameExprs.map(_.eval(EmptyRow)) - private lazy val (nameExprs, valExprs) = - children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip + override def nullable: Boolean = false - private lazy val names = nameExprs.map(_.eval(EmptyRow)) + override def foldable: Boolean = valExprs.forall(_.foldable) override lazy val dataType: StructType = { val fields = names.zip(valExprs).map { - case (name, valExpr: NamedExpression) => - StructField(name.asInstanceOf[UTF8String].toString, - valExpr.dataType, valExpr.nullable, valExpr.metadata) - case (name, valExpr) => - StructField(name.asInstanceOf[UTF8String].toString, - valExpr.dataType, valExpr.nullable, Metadata.empty) + case (name, expr) => + val metadata = expr match { + case ne: NamedExpression => ne.metadata + case _ => Metadata.empty + } + StructField(name.toString, expr.dataType, expr.nullable, metadata) } StructType(fields) } - override def foldable: Boolean = valExprs.forall(_.foldable) - - override def nullable: Boolean = false - override def checkInputDataTypes(): TypeCheckResult = { if (children.size % 2 != 0) { TypeCheckResult.TypeCheckFailure(s"$prettyName expects an even number of arguments.") @@ -274,8 +246,8 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression { val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType) if (invalidNames.nonEmpty) { TypeCheckResult.TypeCheckFailure( - s"Only foldable StringType expressions are allowed to appear at odd position , got :" + - s" ${invalidNames.mkString(",")}") + "Only foldable StringType expressions are allowed to appear at odd position, got:" + + s" ${invalidNames.mkString(",")}") } else if (!names.contains(null)) { TypeCheckResult.TypeCheckSuccess } else { @@ -284,9 +256,29 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression { } } + /** + * Returns Aliased [[Expression]]s that could be used to construct a flattened version of this + * StructType. + */ + def flatten: Seq[NamedExpression] = valExprs.zip(names).map { + case (v, n) => Alias(v, n.toString)() + } + override def eval(input: InternalRow): Any = { InternalRow(valExprs.map(_.eval(input)): _*) } +} + +/** + * Creates a struct with the given field names and values + * + * @param children Seq(name1, val1, name2, val2, ...) + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(name1, val1, name2, val2, ...) - Creates a struct with the given field names and values.") +// scalastyle:on line.size.limit +case class CreateNamedStruct(children: Seq[Expression]) extends CreateNamedStructLike { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val rowClass = classOf[GenericInternalRow].getName @@ -316,44 +308,6 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression { override def prettyName: String = "named_struct" } -/** - * Returns a Row containing the evaluation of all children expressions. 
This is a variant that - * returns UnsafeRow directly. The unsafe projection operator replaces [[CreateStruct]] with - * this expression automatically at runtime. - */ -case class CreateStructUnsafe(children: Seq[Expression]) extends Expression { - - override def foldable: Boolean = children.forall(_.foldable) - - override lazy val resolved: Boolean = childrenResolved - - override lazy val dataType: StructType = { - val fields = children.zipWithIndex.map { case (child, idx) => - child match { - case ne: NamedExpression => - StructField(ne.name, ne.dataType, ne.nullable, ne.metadata) - case _ => - StructField(s"col${idx + 1}", child.dataType, child.nullable, Metadata.empty) - } - } - StructType(fields) - } - - override def nullable: Boolean = false - - override def eval(input: InternalRow): Any = { - InternalRow(children.map(_.eval(input)): _*) - } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val eval = GenerateUnsafeProjection.createCode(ctx, children) - ExprCode(code = eval.code, isNull = eval.isNull, value = eval.value) - } - - override def prettyName: String = "struct_unsafe" -} - - /** * Creates a struct with the given field names and values. This is a variant that returns * UnsafeRow directly. The unsafe projection operator replaces [[CreateStruct]] with @@ -361,31 +315,7 @@ case class CreateStructUnsafe(children: Seq[Expression]) extends Expression { * * @param children Seq(name1, val1, name2, val2, ...) */ -case class CreateNamedStructUnsafe(children: Seq[Expression]) extends Expression { - - private lazy val (nameExprs, valExprs) = - children.grouped(2).map { case Seq(name, value) => (name, value) }.toList.unzip - - private lazy val names = nameExprs.map(_.eval(EmptyRow).toString) - - override lazy val dataType: StructType = { - val fields = names.zip(valExprs).map { - case (name, valExpr: NamedExpression) => - StructField(name, valExpr.dataType, valExpr.nullable, valExpr.metadata) - case (name, valExpr) => - StructField(name, valExpr.dataType, valExpr.nullable, Metadata.empty) - } - StructType(fields) - } - - override def foldable: Boolean = valExprs.forall(_.foldable) - - override def nullable: Boolean = false - - override def eval(input: InternalRow): Any = { - InternalRow(valExprs.map(_.eval(input)): _*) - } - +case class CreateNamedStructUnsafe(children: Seq[Expression]) extends CreateNamedStructLike { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val eval = GenerateUnsafeProjection.createCode(ctx, valExprs) ExprCode(code = eval.code, isNull = eval.isNull, value = eval.value) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index ac1577b3abb4d..4b151c81d8f8b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -688,8 +688,8 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { // inline table comes in two styles: // style 1: values (1), (2), (3) -- multiple columns are supported // style 2: values 1, 2, 3 -- only a single column is supported here - case CreateStruct(children) => children // style 1 - case child => Seq(child) // style 2 + case struct: CreateNamedStruct => struct.valExprs // style 1 + case child => Seq(child) // style 2 } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 590774c043040..817de48de2798 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.analysis +import org.scalatest.ShouldMatchers + import org.apache.spark.sql.catalyst.{SimpleCatalystConf, TableIdentifier} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ @@ -25,7 +27,8 @@ import org.apache.spark.sql.catalyst.plans.{Cross, Inner} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -class AnalysisSuite extends AnalysisTest { + +class AnalysisSuite extends AnalysisTest with ShouldMatchers { import org.apache.spark.sql.catalyst.analysis.TestRelations._ test("union project *") { @@ -218,9 +221,36 @@ class AnalysisSuite extends AnalysisTest { // CreateStruct is a special case that we should not trim Alias for it. plan = testRelation.select(CreateStruct(Seq(a, (a + 1).as("a+1"))).as("col")) - checkAnalysis(plan, plan) - plan = testRelation.select(CreateStructUnsafe(Seq(a, (a + 1).as("a+1"))).as("col")) - checkAnalysis(plan, plan) + expected = testRelation.select(CreateNamedStruct(Seq( + Literal(a.name), a, + Literal("a+1"), (a + 1))).as("col")) + checkAnalysis(plan, expected) + } + + test("Analysis may leave unnecassary aliases") { + val att1 = testRelation.output.head + var plan = testRelation.select( + CreateStruct(Seq(att1, ((att1.as("aa")) + 1).as("a_plus_1"))).as("col"), + att1 + ) + val prevPlan = getAnalyzer(true).execute(plan) + plan = prevPlan.select(CreateArray(Seq( + CreateStruct(Seq(att1, (att1 + 1).as("a_plus_1"))).as("col1"), + /** alias should be eliminated by [[CleanupAliases]] */ + "col".attr.as("col2") + )).as("arr")) + plan = getAnalyzer(true).execute(plan) + + val expectedPlan = prevPlan.select( + CreateArray(Seq( + CreateNamedStruct(Seq( + Literal(att1.name), att1, + Literal("a_plus_1"), (att1 + 1))), + 'col.struct(prevPlan.output(0).dataType.asInstanceOf[StructType]).notNull + )).as("arr") + ) + + checkAnalysis(plan, expectedPlan) } test("SPARK-10534: resolve attribute references in order by clause") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index 0c307b2b8576b..c21c6de32c0ba 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -243,7 +243,6 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { val b = AttributeReference("b", IntegerType)() checkMetadata(CreateStruct(Seq(a, b))) checkMetadata(CreateNamedStruct(Seq("a", a, "b", b))) - checkMetadata(CreateStructUnsafe(Seq(a, b))) checkMetadata(CreateNamedStructUnsafe(Seq("a", a, "b", b))) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 249408e0fbce4..7a131b30eafd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -186,6 +186,9 @@ class Column(val expr: Expression) extends Logging { case a: AggregateExpression if 
a.aggregateFunction.isInstanceOf[TypedAggregateExpression] => UnresolvedAlias(a, Some(Column.generateAlias)) + // Wait until the struct is resolved. This will generate a nicer looking alias. + case struct: CreateNamedStructLike => UnresolvedAlias(struct) + case expr: Expression => Alias(expr, usePrettyExpression(expr).sql)() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index f873f34a845ef..6141fab4aff0d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -137,7 +137,7 @@ object ColumnStatStruct { private def numTrues(e: Expression): Expression = Sum(If(e, one, zero)) private def numFalses(e: Expression): Expression = Sum(If(Not(e), one, zero)) - private def getStruct(exprs: Seq[Expression]): CreateStruct = { + private def getStruct(exprs: Seq[Expression]): CreateNamedStruct = { CreateStruct(exprs.map { expr: Expression => expr.transformUp { case af: AggregateFunction => af.toAggregateExpression() @@ -168,7 +168,7 @@ object ColumnStatStruct { } } - def apply(attr: Attribute, relativeSD: Double): CreateStruct = attr.dataType match { + def apply(attr: Attribute, relativeSD: Double): CreateNamedStruct = attr.dataType match { // Use aggregate functions to compute statistics we need. case _: NumericType | TimestampType | DateType => getStruct(numericColumnStat(attr, relativeSD)) case StringType => getStruct(stringColumnStat(attr, relativeSD)) diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index a91f04e098b18..af6c930d64b76 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -87,7 +87,7 @@ struct -- !query 9 SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1 -- !query 9 schema -struct> +struct> -- !query 9 output diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 6eb571b91ffab..90000445dffb2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -190,6 +190,12 @@ private[hive] class TestHiveSparkSession( new File(Thread.currentThread().getContextClassLoader.getResource(path).getFile) } + private def quoteHiveFile(path : String) = if (Utils.isWindows) { + getHiveFile(path).getPath.replace('\\', '/') + } else { + getHiveFile(path).getPath + } + def getWarehousePath(): String = { val tempConf = new SQLConf sc.conf.getAll.foreach { case (k, v) => tempConf.setConfString(k, v) } @@ -225,16 +231,16 @@ private[hive] class TestHiveSparkSession( val hiveQTestUtilTables: Seq[TestTable] = Seq( TestTable("src", "CREATE TABLE src (key INT, value STRING)".cmd, - s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv1.txt")}' INTO TABLE src".cmd), + s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' INTO TABLE src".cmd), TestTable("src1", "CREATE TABLE src1 (key INT, value STRING)".cmd, - s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv3.txt")}' INTO TABLE src1".cmd), + s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv3.txt")}' INTO TABLE src1".cmd), TestTable("srcpart", () 
=> { sql( "CREATE TABLE srcpart (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING)") for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq("11", "12")) { sql( - s"""LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv1.txt")}' + s"""LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' |OVERWRITE INTO TABLE srcpart PARTITION (ds='$ds',hr='$hr') """.stripMargin) } @@ -244,7 +250,7 @@ private[hive] class TestHiveSparkSession( "CREATE TABLE srcpart1 (key INT, value STRING) PARTITIONED BY (ds STRING, hr INT)") for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- 11 to 12) { sql( - s"""LOAD DATA LOCAL INPATH '${getHiveFile("data/files/kv1.txt")}' + s"""LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' |OVERWRITE INTO TABLE srcpart1 PARTITION (ds='$ds',hr='$hr') """.stripMargin) } @@ -269,7 +275,7 @@ private[hive] class TestHiveSparkSession( sql( s""" - |LOAD DATA LOCAL INPATH '${getHiveFile("data/files/complex.seq")}' + |LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/complex.seq")}' |INTO TABLE src_thrift """.stripMargin) }), @@ -308,7 +314,7 @@ private[hive] class TestHiveSparkSession( |) """.stripMargin.cmd, s""" - |LOAD DATA LOCAL INPATH '${getHiveFile("data/files/episodes.avro")}' + |LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/episodes.avro")}' |INTO TABLE episodes """.stripMargin.cmd ), @@ -379,7 +385,7 @@ private[hive] class TestHiveSparkSession( TestTable("src_json", s"""CREATE TABLE src_json (json STRING) STORED AS TEXTFILE """.stripMargin.cmd, - s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/json.txt")}' INTO TABLE src_json".cmd) + s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/json.txt")}' INTO TABLE src_json".cmd) ) hiveQTestUtilTables.foreach(registerTestTable) diff --git a/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql b/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql index de0116a4dcbaf..cdda29af50e37 100644 --- a/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql +++ b/sql/hive/src/test/resources/sqlgen/subquery_in_having_2.sql @@ -7,4 +7,4 @@ having b.key in (select a.key where a.value > 'val_9' and a.value = min(b.value)) order by b.key -------------------------------------------------------------------------------- -SELECT `gen_attr_0` AS `key`, `gen_attr_1` AS `min(value)` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_0`, min(`gen_attr_5`) AS `gen_attr_1`, min(`gen_attr_5`) AS `gen_attr_4` FROM (SELECT `key` AS `gen_attr_0`, `value` AS `gen_attr_5` FROM `default`.`src`) AS gen_subquery_0 GROUP BY `gen_attr_0` HAVING (struct(`gen_attr_0`, `gen_attr_4`) IN (SELECT `gen_attr_6` AS `_c0`, `gen_attr_7` AS `_c1` FROM (SELECT `gen_attr_2` AS `gen_attr_6`, `gen_attr_3` AS `gen_attr_7` FROM (SELECT `gen_attr_2`, `gen_attr_3` FROM (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_3` FROM `default`.`src`) AS gen_subquery_3 WHERE (`gen_attr_3` > 'val_9')) AS gen_subquery_2) AS gen_subquery_4))) AS gen_subquery_1 ORDER BY `gen_attr_0` ASC NULLS FIRST) AS b +SELECT `gen_attr_0` AS `key`, `gen_attr_1` AS `min(value)` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `gen_attr_0`, min(`gen_attr_5`) AS `gen_attr_1`, min(`gen_attr_5`) AS `gen_attr_4` FROM (SELECT `key` AS `gen_attr_0`, `value` AS `gen_attr_5` FROM `default`.`src`) AS gen_subquery_0 GROUP BY `gen_attr_0` HAVING (named_struct('gen_attr_0', `gen_attr_0`, 'gen_attr_4', `gen_attr_4`) IN (SELECT `gen_attr_6` AS `_c0`, `gen_attr_7` AS `_c1` FROM (SELECT `gen_attr_2` AS `gen_attr_6`, `gen_attr_3` AS `gen_attr_7` FROM 
(SELECT `gen_attr_2`, `gen_attr_3` FROM (SELECT `key` AS `gen_attr_2`, `value` AS `gen_attr_3` FROM `default`.`src`) AS gen_subquery_3 WHERE (`gen_attr_3` > 'val_9')) AS gen_subquery_2) AS gen_subquery_4))) AS gen_subquery_1 ORDER BY `gen_attr_0` ASC NULLS FIRST) AS b diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala index c7f10e569fa4d..12d18dc87ceb4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst import java.nio.charset.StandardCharsets import java.nio.file.{Files, NoSuchFileException, Paths} +import scala.io.Source import scala.util.control.NonFatal import org.apache.spark.sql.Column @@ -109,12 +110,15 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { Files.write(path, answerText.getBytes(StandardCharsets.UTF_8)) } else { val goldenFileName = s"sqlgen/$answerFile.sql" - val resourceFile = getClass.getClassLoader.getResource(goldenFileName) - if (resourceFile == null) { + val resourceStream = getClass.getClassLoader.getResourceAsStream(goldenFileName) + if (resourceStream == null) { throw new NoSuchFileException(goldenFileName) } - val path = resourceFile.getPath - val answerText = new String(Files.readAllBytes(Paths.get(path)), StandardCharsets.UTF_8) + val answerText = try { + Source.fromInputStream(resourceStream).mkString + } finally { + resourceStream.close + } val sqls = answerText.split(separator) assert(sqls.length == 2, "Golden sql files should have a separator.") val expectedSQL = sqls(1).trim() From 9be069125f7e94df9d862f307b87965baf9416e3 Mon Sep 17 00:00:00 2001 From: Takeshi YAMAMURO Date: Wed, 2 Nov 2016 11:29:26 -0700 Subject: [PATCH 009/534] [SPARK-17683][SQL] Support ArrayType in Literal.apply ## What changes were proposed in this pull request? This pr is to add pattern-matching entries for array data in `Literal.apply`. ## How was this patch tested? Added tests in `LiteralExpressionSuite`. Author: Takeshi YAMAMURO Closes #15257 from maropu/SPARK-17683. 
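For illustration only (not part of this patch): a minimal sketch of what `Literal.apply` accepts after the change. The expected `dataType`s follow from the new code path, which wraps the inferred element type in `ArrayType(elementType)` (so `containsNull = true`).

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType}

// Arrays of primitives and of supported object types are wrapped directly
// instead of failing with "Unsupported literal type".
val intArrayLit = Literal(Array(1, 2, 3))
val strArrayLit = Literal(Array("a", "b", "c"))

assert(intArrayLit.dataType == ArrayType(IntegerType, containsNull = true))
assert(strArrayLit.dataType == ArrayType(StringType, containsNull = true))
```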
(cherry picked from commit 4af0ce2d96de3397c9bc05684cad290a52486577) Signed-off-by: Reynold Xin --- .../sql/catalyst/expressions/literals.scala | 57 ++++++++++++++++++- .../expressions/LiteralExpressionSuite.scala | 27 ++++++++- 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index a597a17aadd99..1985e68c94e2d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -17,14 +17,25 @@ package org.apache.spark.sql.catalyst.expressions +import java.lang.{Boolean => JavaBoolean} +import java.lang.{Byte => JavaByte} +import java.lang.{Double => JavaDouble} +import java.lang.{Float => JavaFloat} +import java.lang.{Integer => JavaInteger} +import java.lang.{Long => JavaLong} +import java.lang.{Short => JavaShort} +import java.math.{BigDecimal => JavaBigDecimal} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util import java.util.Objects import javax.xml.bind.DatatypeConverter +import scala.math.{BigDecimal, BigInt} + import org.json4s.JsonAST._ +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -46,12 +57,17 @@ object Literal { case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) - case d: java.math.BigDecimal => + case d: JavaBigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) + case a: Array[_] => + val elementType = componentTypeToDataType(a.getClass.getComponentType()) + val dataType = ArrayType(elementType) + val convert = CatalystTypeConverters.createToCatalystConverter(dataType) + Literal(convert(a), dataType) case i: CalendarInterval => Literal(i, CalendarIntervalType) case null => Literal(null, NullType) case v: Literal => v @@ -59,6 +75,45 @@ object Literal { throw new RuntimeException("Unsupported literal type " + v.getClass + " " + v) } + /** + * Returns the Spark SQL DataType for a given class object. Since this type needs to be resolved + * in runtime, we use match-case idioms for class objects here. However, there are similar + * functions in other files (e.g., HiveInspectors), so these functions need to merged into one. 
+ */ + private[this] def componentTypeToDataType(clz: Class[_]): DataType = clz match { + // primitive types + case JavaShort.TYPE => ShortType + case JavaInteger.TYPE => IntegerType + case JavaLong.TYPE => LongType + case JavaDouble.TYPE => DoubleType + case JavaByte.TYPE => ByteType + case JavaFloat.TYPE => FloatType + case JavaBoolean.TYPE => BooleanType + + // java classes + case _ if clz == classOf[Date] => DateType + case _ if clz == classOf[Timestamp] => TimestampType + case _ if clz == classOf[JavaBigDecimal] => DecimalType.SYSTEM_DEFAULT + case _ if clz == classOf[Array[Byte]] => BinaryType + case _ if clz == classOf[JavaShort] => ShortType + case _ if clz == classOf[JavaInteger] => IntegerType + case _ if clz == classOf[JavaLong] => LongType + case _ if clz == classOf[JavaDouble] => DoubleType + case _ if clz == classOf[JavaByte] => ByteType + case _ if clz == classOf[JavaFloat] => FloatType + case _ if clz == classOf[JavaBoolean] => BooleanType + + // other scala classes + case _ if clz == classOf[String] => StringType + case _ if clz == classOf[BigInt] => DecimalType.SYSTEM_DEFAULT + case _ if clz == classOf[BigDecimal] => DecimalType.SYSTEM_DEFAULT + case _ if clz == classOf[CalendarInterval] => CalendarIntervalType + + case _ if clz.isArray => ArrayType(componentTypeToDataType(clz.getComponentType)) + + case _ => throw new AnalysisException(s"Unsupported component type $clz in arrays") + } + /** * Constructs a [[Literal]] of [[ObjectType]], for example when you need to pass an object * into code generation. diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index 450222d8cbba3..4af4da8a9f0c2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -43,6 +44,7 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal.create(null, TimestampType), null) checkEvaluation(Literal.create(null, CalendarIntervalType), null) checkEvaluation(Literal.create(null, ArrayType(ByteType, true)), null) + checkEvaluation(Literal.create(null, ArrayType(StringType, true)), null) checkEvaluation(Literal.create(null, MapType(StringType, IntegerType)), null) checkEvaluation(Literal.create(null, StructType(Seq.empty)), null) } @@ -122,5 +124,28 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } } - // TODO(davies): add tests for ArrayType, MapType and StructType + test("array") { + def checkArrayLiteral(a: Array[_], elementType: DataType): Unit = { + val toCatalyst = (a: Array[_], elementType: DataType) => { + CatalystTypeConverters.createToCatalystConverter(ArrayType(elementType))(a) + } + checkEvaluation(Literal(a), toCatalyst(a, elementType)) + } + checkArrayLiteral(Array(1, 2, 3), IntegerType) + checkArrayLiteral(Array("a", "b", "c"), StringType) + checkArrayLiteral(Array(1.0, 4.0), DoubleType) + checkArrayLiteral(Array(CalendarInterval.MICROS_PER_DAY, 
CalendarInterval.MICROS_PER_HOUR), + CalendarIntervalType) + } + + test("unsupported types (map and struct) in literals") { + def checkUnsupportedTypeInLiteral(v: Any): Unit = { + val errMsgMap = intercept[RuntimeException] { + Literal(v) + } + assert(errMsgMap.getMessage.startsWith("Unsupported literal type")) + } + checkUnsupportedTypeInLiteral(Map("key1" -> 1, "key2" -> 2)) + checkUnsupportedTypeInLiteral(("mike", 29, 1.0)) + } } From a885d5bbce9dba66b394850b3aac51ae97cb18dd Mon Sep 17 00:00:00 2001 From: buzhihuojie Date: Wed, 2 Nov 2016 11:36:20 -0700 Subject: [PATCH 010/534] [SPARK-17895] Improve doc for rangeBetween and rowsBetween ## What changes were proposed in this pull request? Copied description for row and range based frame boundary from https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala#L56 Added examples to show different behavior of rangeBetween and rowsBetween when involving duplicate values. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: buzhihuojie Closes #15727 from david-weiluo-ren/improveDocForRangeAndRowsBetween. (cherry picked from commit 742e0fea5391857964e90d396641ecf95cac4248) Signed-off-by: Reynold Xin --- .../apache/spark/sql/expressions/Window.scala | 55 +++++++++++++++++++ .../spark/sql/expressions/WindowSpec.scala | 55 +++++++++++++++++++ 2 files changed, 110 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index 0b26d863cac5d..327bc379d4132 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -121,6 +121,32 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A row based boundary is based on the position of the row within the partition. + * An offset indicates the number of rows above or below the current row, the frame for the + * current row starts or ends. For instance, given a row based sliding frame with a lower bound + * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from + * index 4 to index 6. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rowsBetween(0,1)) + * .show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the @@ -144,6 +170,35 @@ object Window { * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A range based boundary is based on the actual value of the ORDER BY + * expression(s). An offset is used to alter the value of the ORDER BY expression, for + * instance if the current order by expression has a value of 10 and the lower bound offset + * is -3, the resulting lower bound for the current row will be 10 - 3 = 7. 
This however puts a + * number of constraints on the ORDER BY expressions: there can be only one expression and this + * expression must have a numerical data type. An exception can be made when the offset is 0, + * because no value modification is needed, in this case multiple and non-numeric ORDER BY + * expression are allowed. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rangeBetween(0,1)) + * .show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala index 1e85b6e7881ad..4a8ce695bd4da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala @@ -89,6 +89,32 @@ class WindowSpec private[sql]( * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A row based boundary is based on the position of the row within the partition. + * An offset indicates the number of rows above or below the current row, the frame for the + * current row starts or ends. For instance, given a row based sliding frame with a lower bound + * offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from + * index 4 to index 6. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rowsBetween(0,1)) + * .show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the @@ -111,6 +137,35 @@ class WindowSpec private[sql]( * and [[Window.currentRow]] to specify special boundary values, rather than using integral * values directly. * + * A range based boundary is based on the actual value of the ORDER BY + * expression(s). An offset is used to alter the value of the ORDER BY expression, for + * instance if the current order by expression has a value of 10 and the lower bound offset + * is -3, the resulting lower bound for the current row will be 10 - 3 = 7. This however puts a + * number of constraints on the ORDER BY expressions: there can be only one expression and this + * expression must have a numerical data type. An exception can be made when the offset is 0, + * because no value modification is needed, in this case multiple and non-numeric ORDER BY + * expression are allowed. 
+ * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * df.withColumn("sum", + * sum('id) over Window.partitionBy('category).orderBy('id).rangeBetween(0,1)) + * .show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * * @param start boundary start, inclusive. The frame is unbounded if this is * the minimum long value ([[Window.unboundedPreceding]]). * @param end boundary end, inclusive. The frame is unbounded if this is the From 0093257ea94d3a197ca061b54c04685d7c1f616a Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 2 Nov 2016 11:41:49 -0700 Subject: [PATCH 011/534] [SPARK-14393][SQL] values generated by non-deterministic functions shouldn't change after coalesce or union ## What changes were proposed in this pull request? When a user appended a column using a "nondeterministic" function to a DataFrame, e.g., `rand`, `randn`, and `monotonically_increasing_id`, the expected semantic is the following: - The value in each row should remain unchanged, as if we materialize the column immediately, regardless of later DataFrame operations. However, since we use `TaskContext.getPartitionId` to get the partition index from the current thread, the values from nondeterministic columns might change if we call `union` or `coalesce` after. `TaskContext.getPartitionId` returns the partition index of the current Spark task, which might not be the corresponding partition index of the DataFrame where we defined the column. See the unit tests below or JIRA for examples. This PR uses the partition index from `RDD.mapPartitionWithIndex` instead of `TaskContext` and fixes the partition initialization logic in whole-stage codegen, normal codegen, and codegen fallback. `initializeStatesForPartition(partitionIndex: Int)` was added to `Projection`, `Nondeterministic`, and `Predicate` (codegen) and initialized right after object creation in `mapPartitionWithIndex`. `newPredicate` now returns a `Predicate` instance rather than a function for proper initialization. ## How was this patch tested? Unit tests. (Actually I'm not very confident that this PR fixed all issues without introducing new ones ...) cc: rxin davies Author: Xiangrui Meng Closes #15567 from mengxr/SPARK-14393. 
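For illustration only (not part of this patch), a minimal sketch of the expected semantics described above, assuming a running `SparkSession` named `spark`:

```scala
import org.apache.spark.sql.functions.rand

// A nondeterministic column, conceptually materialized once per row...
val df = spark.range(0, 10, 1, 2).withColumn("r", rand(42L))
val before = df.collect().map(_.getDouble(1)).toSet

// ...should keep the same per-row values when the plan is later coalesced
// (or unioned), because the projection is now initialized with the partition
// index of the RDD that defines the column rather than the task's partition id.
val afterCoalesce = df.coalesce(1).collect().map(_.getDouble(1)).toSet
assert(before == afterCoalesce)
```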
(cherry picked from commit 02f203107b8eda1f1576e36c4f12b0e3bc5e910e) Signed-off-by: Reynold Xin --- .../main/scala/org/apache/spark/rdd/RDD.scala | 16 +++++- .../sql/catalyst/expressions/Expression.scala | 19 +++++-- .../catalyst/expressions/InputFileName.scala | 2 +- .../MonotonicallyIncreasingID.scala | 11 ++-- .../sql/catalyst/expressions/Projection.scala | 22 +++++--- .../expressions/SparkPartitionID.scala | 13 +++-- .../expressions/codegen/CodeGenerator.scala | 14 +++++ .../expressions/codegen/CodegenFallback.scala | 18 +++++-- .../codegen/GenerateMutableProjection.scala | 4 ++ .../codegen/GeneratePredicate.scala | 18 +++++-- .../codegen/GenerateSafeProjection.scala | 4 ++ .../codegen/GenerateUnsafeProjection.scala | 4 ++ .../sql/catalyst/expressions/package.scala | 10 +++- .../sql/catalyst/expressions/predicates.scala | 4 -- .../expressions/randomExpressions.scala | 14 ++--- .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../expressions/ExpressionEvalHelper.scala | 5 +- .../CodegenExpressionCachingSuite.scala | 13 +++-- .../sql/execution/DataSourceScanExec.scala | 6 ++- .../spark/sql/execution/ExistingRDD.scala | 3 +- .../spark/sql/execution/GenerateExec.scala | 3 +- .../spark/sql/execution/SparkPlan.scala | 4 +- .../sql/execution/WholeStageCodegenExec.scala | 8 ++- .../execution/basicPhysicalOperators.scala | 8 +-- .../columnar/InMemoryTableScanExec.scala | 5 +- .../joins/BroadcastNestedLoopJoinExec.scala | 7 +-- .../joins/CartesianProductExec.scala | 8 +-- .../spark/sql/execution/joins/HashJoin.scala | 2 +- .../execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala | 6 ++- .../spark/sql/DataFrameFunctionsSuite.scala | 52 +++++++++++++++++++ .../hive/execution/HiveTableScanExec.scala | 3 +- 32 files changed, 231 insertions(+), 78 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index db535de9e9bb3..e018af35cb18d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -788,14 +788,26 @@ abstract class RDD[T: ClassTag]( } /** - * [performance] Spark's internal mapPartitions method which skips closure cleaning. It is a - * performance API to be used carefully only if we are sure that the RDD elements are + * [performance] Spark's internal mapPartitionsWithIndex method that skips closure cleaning. + * It is a performance API to be used carefully only if we are sure that the RDD elements are * serializable and don't require closure cleaning. * * @param preservesPartitioning indicates whether the input function preserves the partitioner, * which should be `false` unless this is a pair RDD and the input function doesn't modify * the keys. */ + private[spark] def mapPartitionsWithIndexInternal[U: ClassTag]( + f: (Int, Iterator[T]) => Iterator[U], + preservesPartitioning: Boolean = false): RDD[U] = withScope { + new MapPartitionsRDD( + this, + (context: TaskContext, index: Int, iter: Iterator[T]) => f(index, iter), + preservesPartitioning) + } + + /** + * [performance] Spark's internal mapPartitions method that skips closure cleaning. 
+ */ private[spark] def mapPartitionsInternal[U: ClassTag]( f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = withScope { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 9edc1ceff26a7..726a231fd814e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -272,17 +272,28 @@ trait Nondeterministic extends Expression { final override def deterministic: Boolean = false final override def foldable: Boolean = false + @transient private[this] var initialized = false - final def setInitialValues(): Unit = { - initInternal() + /** + * Initializes internal states given the current partition index and mark this as initialized. + * Subclasses should override [[initializeInternal()]]. + */ + final def initialize(partitionIndex: Int): Unit = { + initializeInternal(partitionIndex) initialized = true } - protected def initInternal(): Unit + protected def initializeInternal(partitionIndex: Int): Unit + /** + * @inheritdoc + * Throws an exception if [[initialize()]] is not called yet. + * Subclasses should override [[evalInternal()]]. + */ final override def eval(input: InternalRow = null): Any = { - require(initialized, "nondeterministic expression should be initialized before evaluate") + require(initialized, + s"Nondeterministic expression ${this.getClass.getName} should be initialized before eval.") evalInternal(input) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala index 96929ecf56375..b6c12c5351119 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InputFileName.scala @@ -37,7 +37,7 @@ case class InputFileName() extends LeafExpression with Nondeterministic { override def prettyName: String = "input_file_name" - override protected def initInternal(): Unit = {} + override protected def initializeInternal(partitionIndex: Int): Unit = {} override protected def evalInternal(input: InternalRow): UTF8String = { InputFileNameHolder.getInputFileName() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala index 5b4922e0cf2b7..72b8dcca26e2f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala @@ -50,9 +50,9 @@ case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterminis @transient private[this] var partitionMask: Long = _ - override protected def initInternal(): Unit = { + override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L - partitionMask = TaskContext.getPartitionId().toLong << 33 + partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false @@ -68,9 +68,10 @@ case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterminis override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val 
countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") - ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") - ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, - s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") + ctx.addMutableState(ctx.JAVA_LONG, countTerm, "") + ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "") + ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") + ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala index 03e054d098511..476e37e6a9bac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Projection.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.types.{DataType, StructType} /** * A [[Projection]] that is calculated by calling the `eval` of each of the specified expressions. + * * @param expressions a sequence of expressions that determine the value of each column of the * output row. */ @@ -30,10 +31,12 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection { def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) = this(expressions.map(BindReferences.bindReference(_, inputSchema))) - expressions.foreach(_.foreach { - case n: Nondeterministic => n.setInitialValues() - case _ => - }) + override def initialize(partitionIndex: Int): Unit = { + expressions.foreach(_.foreach { + case n: Nondeterministic => n.initialize(partitionIndex) + case _ => + }) + } // null check is required for when Kryo invokes the no-arg constructor. protected val exprArray = if (expressions != null) expressions.toArray else null @@ -54,6 +57,7 @@ class InterpretedProjection(expressions: Seq[Expression]) extends Projection { /** * A [[MutableProjection]] that is calculated by calling `eval` on each of the specified * expressions. + * * @param expressions a sequence of expressions that determine the value of each column of the * output row. 
*/ @@ -63,10 +67,12 @@ case class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mu private[this] val buffer = new Array[Any](expressions.size) - expressions.foreach(_.foreach { - case n: Nondeterministic => n.setInitialValues() - case _ => - }) + override def initialize(partitionIndex: Int): Unit = { + expressions.foreach(_.foreach { + case n: Nondeterministic => n.initialize(partitionIndex) + case _ => + }) + } private[this] val exprArray = expressions.toArray private[this] var mutableRow: InternalRow = new GenericInternalRow(exprArray.length) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala index 1f675d5b07270..6bef473cac060 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala @@ -17,16 +17,15 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, IntegerType} /** - * Expression that returns the current partition id of the Spark task. + * Expression that returns the current partition id. */ @ExpressionDescription( - usage = "_FUNC_() - Returns the current partition id of the Spark task", + usage = "_FUNC_() - Returns the current partition id", extended = "> SELECT _FUNC_();\n 0") case class SparkPartitionID() extends LeafExpression with Nondeterministic { @@ -38,16 +37,16 @@ case class SparkPartitionID() extends LeafExpression with Nondeterministic { override val prettyName = "SPARK_PARTITION_ID" - override protected def initInternal(): Unit = { - partitionId = TaskContext.getPartitionId() + override protected def initializeInternal(partitionIndex: Int): Unit = { + partitionId = partitionIndex } override protected def evalInternal(input: InternalRow): Int = partitionId override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val idTerm = ctx.freshName("partitionId") - ctx.addMutableState(ctx.JAVA_INT, idTerm, - s"$idTerm = org.apache.spark.TaskContext.getPartitionId();") + ctx.addMutableState(ctx.JAVA_INT, idTerm, "") + ctx.addPartitionInitializationStatement(s"$idTerm = partitionIndex;") ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = $idTerm;", isNull = "false") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 6cab50ae1bf8d..9c3c6d3b2a7f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -184,6 +184,20 @@ class CodegenContext { splitExpressions(initCodes, "init", Nil) } + /** + * Code statements to initialize states that depend on the partition index. + * An integer `partitionIndex` will be made available within the scope. 
+ */ + val partitionInitializationStatements: mutable.ArrayBuffer[String] = mutable.ArrayBuffer.empty + + def addPartitionInitializationStatement(statement: String): Unit = { + partitionInitializationStatements += statement + } + + def initPartition(): String = { + partitionInitializationStatements.mkString("\n") + } + /** * Holding all the functions those will be added into generated class. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala index 6a5a3e7933eea..0322d1dd6a9ff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenFallback.scala @@ -25,15 +25,23 @@ import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, No trait CodegenFallback extends Expression { protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - foreach { - case n: Nondeterministic => n.setInitialValues() - case _ => - } - // LeafNode does not need `input` val input = if (this.isInstanceOf[LeafExpression]) "null" else ctx.INPUT_ROW val idx = ctx.references.length ctx.references += this + var childIndex = idx + this.foreach { + case n: Nondeterministic => + // This might add the current expression twice, but it won't hurt. + ctx.references += n + childIndex += 1 + ctx.addPartitionInitializationStatement( + s""" + |((Nondeterministic) references[$childIndex]) + | .initialize(partitionIndex); + """.stripMargin) + case _ => + } val objectTerm = ctx.freshName("obj") val placeHolder = ctx.registerComment(this.toString) if (nullable) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index 5c4b56b0b224c..4d732445544a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -111,6 +111,10 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP ${ctx.initMutableStates()} } + public void initialize(int partitionIndex) { + ${ctx.initPartition()} + } + ${ctx.declareAddedFunctions()} public ${classOf[BaseMutableProjection].getName} target(InternalRow row) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala index 39aa7b17de6c9..dcd1ed96a298e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala @@ -25,19 +25,26 @@ import org.apache.spark.sql.catalyst.expressions._ */ abstract class Predicate { def eval(r: InternalRow): Boolean + + /** + * Initializes internal states given the current partition index. + * This is used by nondeterministic expressions to set initial states. + * The default implementation does nothing. 
+ */ + def initialize(partitionIndex: Int): Unit = {} } /** * Generates bytecode that evaluates a boolean [[Expression]] on a given input [[InternalRow]]. */ -object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Boolean] { +object GeneratePredicate extends CodeGenerator[Expression, Predicate] { protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in) protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression = BindReferences.bindReference(in, inputSchema) - protected def create(predicate: Expression): ((InternalRow) => Boolean) = { + protected def create(predicate: Expression): Predicate = { val ctx = newCodeGenContext() val eval = predicate.genCode(ctx) @@ -55,6 +62,10 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool ${ctx.initMutableStates()} } + public void initialize(int partitionIndex) { + ${ctx.initPartition()} + } + ${ctx.declareAddedFunctions()} public boolean eval(InternalRow ${ctx.INPUT_ROW}) { @@ -67,7 +78,6 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}") - val p = CodeGenerator.compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] - (r: InternalRow) => p.eval(r) + CodeGenerator.compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 2773e1a666212..b1cb6edefb852 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -173,6 +173,10 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] ${ctx.initMutableStates()} } + public void initialize(int partitionIndex) { + ${ctx.initPartition()} + } + ${ctx.declareAddedFunctions()} public java.lang.Object apply(java.lang.Object _i) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 7cc45372daa5a..7e4c9089a2cb9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -380,6 +380,10 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ${ctx.initMutableStates()} } + public void initialize(int partitionIndex) { + ${ctx.initPartition()} + } + ${ctx.declareAddedFunctions()} // Scala.Function1 need this diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala index 1510a4796683c..1b00c9e79da22 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/package.scala @@ -64,7 +64,15 @@ package object expressions { * column of the new row. 
If the schema of the input row is specified, then the given expression * will be bound to that schema. */ - abstract class Projection extends (InternalRow => InternalRow) + abstract class Projection extends (InternalRow => InternalRow) { + + /** + * Initializes internal states given the current partition index. + * This is used by nondeterministic expressions to set initial states. + * The default implementation does nothing. + */ + def initialize(partitionIndex: Int): Unit = {} + } /** * Converts a [[InternalRow]] to another Row given a sequence of expression that define each diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 9394e39aadd9d..c941a576d00d6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -31,10 +31,6 @@ object InterpretedPredicate { create(BindReferences.bindReference(expression, inputSchema)) def create(expression: Expression): (InternalRow => Boolean) = { - expression.foreach { - case n: Nondeterministic => n.setInitialValues() - case _ => - } (r: InternalRow) => expression.eval(r).asInstanceOf[Boolean] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala index ca200768b2286..e09029f5aab9b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala @@ -42,8 +42,8 @@ abstract class RDG extends LeafExpression with Nondeterministic { */ @transient protected var rng: XORShiftRandom = _ - override protected def initInternal(): Unit = { - rng = new XORShiftRandom(seed + TaskContext.getPartitionId) + override protected def initializeInternal(partitionIndex: Int): Unit = { + rng = new XORShiftRandom(seed + partitionIndex) } override def nullable: Boolean = false @@ -70,8 +70,9 @@ case class Rand(seed: Long) extends RDG { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName - ctx.addMutableState(className, rngTerm, - s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") + ctx.addMutableState(className, rngTerm, "") + ctx.addPartitionInitializationStatement( + s"$rngTerm = new $className(${seed}L + partitionIndex);") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextDouble();""", isNull = "false") } @@ -93,8 +94,9 @@ case class Randn(seed: Long) extends RDG { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName - ctx.addMutableState(className, rngTerm, - s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") + ctx.addMutableState(className, rngTerm, "") + ctx.addPartitionInitializationStatement( + s"$rngTerm = new $className(${seed}L + partitionIndex);") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""", isNull = "false") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index e5e2cd7d27d15..b6ad5db74e3c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1060,6 +1060,7 @@ object ConvertToLocalRelation extends Rule[LogicalPlan] { case Project(projectList, LocalRelation(output, data)) if !projectList.exists(hasUnevaluableExpr) => val projection = new InterpretedProjection(projectList, output) + projection.initialize(0) LocalRelation(projectList.map(_.toAttribute), data.map(projection)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index f0c149c02b9aa..9ceb709185417 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -75,7 +75,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { protected def evaluate(expression: Expression, inputRow: InternalRow = EmptyRow): Any = { expression.foreach { - case n: Nondeterministic => n.setInitialValues() + case n: Nondeterministic => n.initialize(0) case _ => } expression.eval(inputRow) @@ -121,6 +121,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { val plan = generateProject( GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), expression) + plan.initialize(0) val actual = plan(inputRow).get(0, expression.dataType) if (!checkResult(actual, expected)) { @@ -182,12 +183,14 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { var plan = generateProject( GenerateMutableProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), expression) + plan.initialize(0) var actual = plan(inputRow).get(0, expression.dataType) assert(checkResult(actual, expected)) plan = generateProject( GenerateUnsafeProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil), expression) + plan.initialize(0) actual = FromUnsafeProjection(expression.dataType :: Nil)( plan(inputRow)).get(0, expression.dataType) assert(checkResult(actual, expected)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala index 06dc3bd33b90e..fe5cb8eda824f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodegenExpressionCachingSuite.scala @@ -31,19 +31,22 @@ class CodegenExpressionCachingSuite extends SparkFunSuite { // Use an Add to wrap two of them together in case we only initialize the top level expressions. 
val expr = And(NondeterministicExpression(), NondeterministicExpression()) val instance = UnsafeProjection.create(Seq(expr)) + instance.initialize(0) assert(instance.apply(null).getBoolean(0) === false) } test("GenerateMutableProjection should initialize expressions") { val expr = And(NondeterministicExpression(), NondeterministicExpression()) val instance = GenerateMutableProjection.generate(Seq(expr)) + instance.initialize(0) assert(instance.apply(null).getBoolean(0) === false) } test("GeneratePredicate should initialize expressions") { val expr = And(NondeterministicExpression(), NondeterministicExpression()) val instance = GeneratePredicate.generate(expr) - assert(instance.apply(null) === false) + instance.initialize(0) + assert(instance.eval(null) === false) } test("GenerateUnsafeProjection should not share expression instances") { @@ -73,13 +76,13 @@ class CodegenExpressionCachingSuite extends SparkFunSuite { test("GeneratePredicate should not share expression instances") { val expr1 = MutableExpression() val instance1 = GeneratePredicate.generate(expr1) - assert(instance1.apply(null) === false) + assert(instance1.eval(null) === false) val expr2 = MutableExpression() expr2.mutableState = true val instance2 = GeneratePredicate.generate(expr2) - assert(instance1.apply(null) === false) - assert(instance2.apply(null) === true) + assert(instance1.eval(null) === false) + assert(instance2.eval(null) === true) } } @@ -89,7 +92,7 @@ class CodegenExpressionCachingSuite extends SparkFunSuite { */ case class NondeterministicExpression() extends LeafExpression with Nondeterministic with CodegenFallback { - override protected def initInternal(): Unit = { } + override protected def initializeInternal(partitionIndex: Int): Unit = {} override protected def evalInternal(input: InternalRow): Any = false override def nullable: Boolean = false override def dataType: DataType = BooleanType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index fdd1fa3648251..e485b52b43f76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -71,8 +71,9 @@ case class RowDataSourceScanExec( val unsafeRow = if (outputUnsafeRows) { rdd } else { - rdd.mapPartitionsInternal { iter => + rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) + proj.initialize(index) iter.map(proj) } } @@ -284,8 +285,9 @@ case class FileSourceScanExec( val unsafeRows = { val scan = inputRDD if (needsUnsafeRowConversion) { - scan.mapPartitionsInternal { iter => + scan.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) + proj.initialize(index) iter.map(proj) } } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index 455fb5bfbb6f7..aab087cd98716 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -190,8 +190,9 @@ case class RDDScanExec( protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") - rdd.mapPartitionsInternal { iter => + rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) + 
proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala index 2663129562660..19fbf0c162048 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GenerateExec.scala @@ -94,8 +94,9 @@ case class GenerateExec( } val numOutputRows = longMetric("numOutputRows") - rows.mapPartitionsInternal { iter => + rows.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(output, output) + proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 24d0cffef82a2..cadab37a449aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.expressions.codegen.{Predicate => GenPredicate, _} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.metric.SQLMetric @@ -354,7 +354,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ } protected def newPredicate( - expression: Expression, inputSchema: Seq[Attribute]): (InternalRow) => Boolean = { + expression: Expression, inputSchema: Seq[Attribute]): GenPredicate = { GeneratePredicate.generate(expression, inputSchema) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 6303483f22fd3..516b9d5444d31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -331,6 +331,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co partitionIndex = index; this.inputs = inputs; ${ctx.initMutableStates()} + ${ctx.initPartition()} } ${ctx.declareAddedFunctions()} @@ -383,10 +384,13 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co } else { // Right now, we support up to two input RDDs. 
rdds.head.zipPartitions(rdds(1)) { (leftIter, rightIter) => - val partitionIndex = TaskContext.getPartitionId() + Iterator((leftIter, rightIter)) + // a small hack to obtain the correct partition index + }.mapPartitionsWithIndex { (index, zippedIter) => + val (leftIter, rightIter) = zippedIter.next() val clazz = CodeGenerator.compile(cleanedSource) val buffer = clazz.generate(references).asInstanceOf[BufferedRowIterator] - buffer.init(partitionIndex, Array(leftIter, rightIter)) + buffer.init(index, Array(leftIter, rightIter)) new Iterator[InternalRow] { override def hasNext: Boolean = { val v = buffer.hasNext diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index a5291e0c12f88..32133f52630cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -70,9 +70,10 @@ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) } protected override def doExecute(): RDD[InternalRow] = { - child.execute().mapPartitionsInternal { iter => + child.execute().mapPartitionsWithIndexInternal { (index, iter) => val project = UnsafeProjection.create(projectList, child.output, subexpressionEliminationEnabled) + project.initialize(index) iter.map(project) } } @@ -205,10 +206,11 @@ case class FilterExec(condition: Expression, child: SparkPlan) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") - child.execute().mapPartitionsInternal { iter => + child.execute().mapPartitionsWithIndexInternal { (index, iter) => val predicate = newPredicate(condition, child.output) + predicate.initialize(0) iter.filter { row => - val r = predicate(row) + val r = predicate.eval(row) if (r) numOutputRows += 1 r } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index b87016d5a5696..9028caa446e8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -132,10 +132,11 @@ case class InMemoryTableScanExec( val relOutput: AttributeSeq = relation.output val buffers = relation.cachedColumnBuffers - buffers.mapPartitionsInternal { cachedBatchIterator => + buffers.mapPartitionsWithIndexInternal { (index, cachedBatchIterator) => val partitionFilter = newPredicate( partitionFilters.reduceOption(And).getOrElse(Literal(true)), schema) + partitionFilter.initialize(index) // Find the ordinals and data types of the requested columns. 
val (requestedColumnIndices, requestedColumnDataTypes) = @@ -147,7 +148,7 @@ case class InMemoryTableScanExec( val cachedBatchesToScan = if (inMemoryPartitionPruningEnabled) { cachedBatchIterator.filter { cachedBatch => - if (!partitionFilter(cachedBatch.stats)) { + if (!partitionFilter.eval(cachedBatch.stats)) { def statsString: String = schemaIndex.map { case (a, i) => val value = cachedBatch.stats.get(i, a.dataType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala index bfe7e3dea45df..f526a19876670 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala @@ -52,7 +52,7 @@ case class BroadcastNestedLoopJoinExec( UnspecifiedDistribution :: BroadcastDistribution(IdentityBroadcastMode) :: Nil } - private[this] def genResultProjection: InternalRow => InternalRow = joinType match { + private[this] def genResultProjection: UnsafeProjection = joinType match { case LeftExistence(j) => UnsafeProjection.create(output, output) case other => @@ -84,7 +84,7 @@ case class BroadcastNestedLoopJoinExec( @transient private lazy val boundCondition = { if (condition.isDefined) { - newPredicate(condition.get, streamed.output ++ broadcast.output) + newPredicate(condition.get, streamed.output ++ broadcast.output).eval _ } else { (r: InternalRow) => true } @@ -366,8 +366,9 @@ case class BroadcastNestedLoopJoinExec( } val numOutputRows = longMetric("numOutputRows") - resultRdd.mapPartitionsInternal { iter => + resultRdd.mapPartitionsWithIndexInternal { (index, iter) => val resultProj = genResultProjection + resultProj.initialize(index) iter.map { r => numOutputRows += 1 resultProj(r) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala index 15dc9b40662e2..8341fe2ffd078 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala @@ -98,15 +98,15 @@ case class CartesianProductExec( val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) - pair.mapPartitionsInternal { iter => + pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { - val boundCondition: (InternalRow) => Boolean = - newPredicate(condition.get, left.output ++ right.output) + val boundCondition = newPredicate(condition.get, left.output ++ right.output) + boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => - boundCondition(joined(r._1, r._2)) + boundCondition.eval(joined(r._1, r._2)) } } else { iter diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala index 05c5e2f4cd77b..1aef5f6864263 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala @@ -81,7 +81,7 @@ trait HashJoin { UnsafeProjection.create(streamedKeys) @transient 
private[this] lazy val boundCondition = if (condition.isDefined) { - newPredicate(condition.get, streamedPlan.output ++ buildPlan.output) + newPredicate(condition.get, streamedPlan.output ++ buildPlan.output).eval _ } else { (r: InternalRow) => true } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index ecf7cf289f034..ca9c0ed8cec32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -101,7 +101,7 @@ case class SortMergeJoinExec( left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => val boundCondition: (InternalRow) => Boolean = { condition.map { cond => - newPredicate(cond, left.output ++ right.output) + newPredicate(cond, left.output ++ right.output).eval _ }.getOrElse { (r: InternalRow) => true } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index 9df56bbf1ef87..fde3b2a528994 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -87,8 +87,9 @@ case class DeserializeToObjectExec( } override protected def doExecute(): RDD[InternalRow] = { - child.execute().mapPartitionsInternal { iter => + child.execute().mapPartitionsWithIndexInternal { (index, iter) => val projection = GenerateSafeProjection.generate(deserializer :: Nil, child.output) + projection.initialize(index) iter.map(projection) } } @@ -124,8 +125,9 @@ case class SerializeFromObjectExec( } override protected def doExecute(): RDD[InternalRow] = { - child.execute().mapPartitionsInternal { iter => + child.execute().mapPartitionsWithIndexInternal { (index, iter) => val projection = UnsafeProjection.create(serializer) + projection.initialize(index) iter.map(projection) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 586a0fffeb7a1..0e9a2c6cf7dec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -19,7 +19,13 @@ package org.apache.spark.sql import java.nio.charset.StandardCharsets +import scala.util.Random + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -406,4 +412,50 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext { Seq(Row(true), Row(true)) ) } + + private def assertValuesDoNotChangeAfterCoalesceOrUnion(v: Column): Unit = { + import DataFrameFunctionsSuite.CodegenFallbackExpr + for ((codegenFallback, wholeStage) <- Seq((true, false), (false, false), (false, true))) { + val c = if (codegenFallback) { + Column(CodegenFallbackExpr(v.expr)) + } else { + v + } + withSQLConf( + (SQLConf.WHOLESTAGE_FALLBACK.key, codegenFallback.toString), + (SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, wholeStage.toString)) { + val df = spark.range(0, 4, 1, 4).withColumn("c", c) + val 
rows = df.collect() + val rowsAfterCoalesce = df.coalesce(2).collect() + assert(rows === rowsAfterCoalesce, "Values changed after coalesce when " + + s"codegenFallback=$codegenFallback and wholeStage=$wholeStage.") + + val df1 = spark.range(0, 2, 1, 2).withColumn("c", c) + val rows1 = df1.collect() + val df2 = spark.range(2, 4, 1, 2).withColumn("c", c) + val rows2 = df2.collect() + val rowsAfterUnion = df1.union(df2).collect() + assert(rowsAfterUnion === rows1 ++ rows2, "Values changed after union when " + + s"codegenFallback=$codegenFallback and wholeStage=$wholeStage.") + } + } + } + + test("SPARK-14393: values generated by non-deterministic functions shouldn't change after " + + "coalesce or union") { + Seq( + monotonically_increasing_id(), spark_partition_id(), + rand(Random.nextLong()), randn(Random.nextLong()) + ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) + } +} + +object DataFrameFunctionsSuite { + case class CodegenFallbackExpr(child: Expression) extends Expression with CodegenFallback { + override def children: Seq[Expression] = Seq(child) + override def nullable: Boolean = child.nullable + override def dataType: DataType = child.dataType + override lazy val resolved = true + override def eval(input: InternalRow): Any = child.eval(input) + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 231f204b12b47..c80695bd3e0fe 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -154,8 +154,9 @@ case class HiveTableScanExec( val numOutputRows = longMetric("numOutputRows") // Avoid to serialize MetastoreRelation because schema is lazy. (see SPARK-15649) val outputSchema = schema - rdd.mapPartitionsInternal { iter => + rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(outputSchema) + proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) From bd3ea6595788a4fe5399e6c6c666618d8cb6872c Mon Sep 17 00:00:00 2001 From: Jeff Zhang Date: Wed, 2 Nov 2016 11:47:45 -0700 Subject: [PATCH 012/534] [SPARK-18160][CORE][YARN] spark.files & spark.jars should not be passed to driver in yarn mode ## What changes were proposed in this pull request? spark.files is still passed to driver in yarn mode, so SparkContext will still handle it which cause the error in the jira desc. ## How was this patch tested? Tested manually in a 5 node cluster. As this issue only happens in multiple node cluster, so I didn't write test for it. Author: Jeff Zhang Closes #15669 from zjffdu/SPARK-18160. 
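A rough sketch of the failure mode and of the essence of the fix that the diff below applies in `yarn/Client.scala`; the conf construction and file path here are illustrative only, not taken from the patch:

```
// In yarn cluster mode, SparkSubmit ships --files/--jars through the YARN
// distributed cache, but the same entries also end up in spark.files /
// spark.jars. Before this change the driver-side SparkContext would then try
// to handle those client-local paths itself, which is what triggers the error
// described in the JIRA.
import org.apache.spark.SparkConf

val sparkConf = new SparkConf()
  .set("spark.files", "/path/only/on/the/client/config.txt") // illustrative path

// The fix: drop the keys before the AM-side SparkConf is built, since the
// YARN cache already distributes these files (see Client.scala below).
sparkConf.remove("spark.jars")
sparkConf.remove("spark.files")
```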
(cherry picked from commit 3c24299b71e23e159edbb972347b13430f92a465) Signed-off-by: Marcelo Vanzin --- .../scala/org/apache/spark/SparkContext.scala | 29 ++++--------------- .../org/apache/spark/deploy/yarn/Client.scala | 5 +++- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 4694790c72cd8..63478c88b057b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1716,29 +1716,12 @@ class SparkContext(config: SparkConf) extends Logging { key = uri.getScheme match { // A JAR file which exists only on the driver node case null | "file" => - if (master == "yarn" && deployMode == "cluster") { - // In order for this to work in yarn cluster mode the user must specify the - // --addJars option to the client to upload the file into the distributed cache - // of the AM to make it show up in the current working directory. - val fileName = new Path(uri.getPath).getName() - try { - env.rpcEnv.fileServer.addJar(new File(fileName)) - } catch { - case e: Exception => - // For now just log an error but allow to go through so spark examples work. - // The spark examples don't really need the jar distributed since its also - // the app jar. - logError("Error adding jar (" + e + "), was the --addJars option used?") - null - } - } else { - try { - env.rpcEnv.fileServer.addJar(new File(uri.getPath)) - } catch { - case exc: FileNotFoundException => - logError(s"Jar not found at $path") - null - } + try { + env.rpcEnv.fileServer.addJar(new File(uri.getPath)) + } catch { + case exc: FileNotFoundException => + logError(s"Jar not found at $path") + null } // A JAR file which exists locally on every worker node case "local" => diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 55e4a833b6707..053a78617d4e0 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1202,7 +1202,10 @@ private object Client extends Logging { // Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes System.setProperty("SPARK_YARN_MODE", "true") val sparkConf = new SparkConf - + // SparkSubmit would use yarn cache to distribute files & jars in yarn mode, + // so remove them from sparkConf here for yarn mode. + sparkConf.remove("spark.jars") + sparkConf.remove("spark.files") val args = new ClientArguments(argStrings) new Client(args, sparkConf).run() } From 1eef8e5cd09dfb8b77044ef9864321618e8ea8c8 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 2 Nov 2016 11:52:29 -0700 Subject: [PATCH 013/534] [SPARK-17058][BUILD] Add maven snapshots-and-staging profile to build/test against staging artifacts ## What changes were proposed in this pull request? Adds a `snapshots-and-staging profile` so that RCs of projects like Hadoop and HBase can be used in developer-only build and test runs. There's a comment above the profile telling people not to use this in production. There's no attempt to do the same for SBT, as Ivy is different. ## How was this patch tested? Tested by building against the Hadoop 2.7.3 RC 1 JARs without the profile (and without any local copy of the 2.7.3 artifacts), the build failed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive -Dhadoop.version=2.7.3 ... 
[INFO] ------------------------------------------------------------------------ [INFO] Building Spark Project Launcher 2.1.0-SNAPSHOT [INFO] ------------------------------------------------------------------------ Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.pom [WARNING] The POM for org.apache.hadoop:hadoop-client:jar:2.7.3 is missing, no dependency information available Downloading: https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.7.3/hadoop-client-2.7.3.jar [INFO] ------------------------------------------------------------------------ [INFO] Reactor Summary: [INFO] [INFO] Spark Project Parent POM ........................... SUCCESS [ 4.482 s] [INFO] Spark Project Tags ................................. SUCCESS [ 17.402 s] [INFO] Spark Project Sketch ............................... SUCCESS [ 11.252 s] [INFO] Spark Project Networking ........................... SUCCESS [ 13.458 s] [INFO] Spark Project Shuffle Streaming Service ............ SUCCESS [ 9.043 s] [INFO] Spark Project Unsafe ............................... SUCCESS [ 16.027 s] [INFO] Spark Project Launcher ............................. FAILURE [ 1.653 s] [INFO] Spark Project Core ................................. SKIPPED ... ``` With the profile, the build completed ``` mvn install -DskipTests -Pyarn,hadoop-2.7,hive,snapshots-and-staging -Dhadoop.version=2.7.3 ``` Author: Steve Loughran Closes #14646 from steveloughran/stevel/SPARK-17058-support-asf-snapshots. (cherry picked from commit 37d95227a21de602b939dae84943ba007f434513) Signed-off-by: Reynold Xin --- pom.xml | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/pom.xml b/pom.xml index aaf7cfa7eb2ad..04d2eaa1d3bac 100644 --- a/pom.xml +++ b/pom.xml @@ -2693,6 +2693,54 @@ + + + snapshots-and-staging + + + https://repository.apache.org/content/groups/staging/ + https://repository.apache.org/content/repositories/snapshots/ + + + + + ASF Staging + ${asf.staging} + + + ASF Snapshots + ${asf.snapshots} + + true + + + false + + + + + + + ASF Staging + ${asf.staging} + + + ASF Snapshots + ${asf.snapshots} + + true + + + false + + + + + + + org.json + json + From be3933ddfa3b6b6cf458c0fc4865a61fef40e76a Mon Sep 17 00:00:00 2001 From: Michael Allman Date: Thu, 10 Nov 2016 13:41:13 -0800 Subject: [PATCH 084/534] [SPARK-17993][SQL] Fix Parquet log output redirection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (Link to Jira issue: https://issues.apache.org/jira/browse/SPARK-17993) ## What changes were proposed in this pull request? PR #14690 broke parquet log output redirection for converted partitioned Hive tables. 
For example, when querying parquet files written by Parquet-mr 1.6.0 Spark prints a torrent of (harmless) warning messages from the Parquet reader: ``` Oct 18, 2016 7:42:18 PM WARNING: org.apache.parquet.CorruptStatistics: Ignoring statistics because created_by could not be parsed (see PARQUET-251): parquet-mr version 1.6.0 org.apache.parquet.VersionParser$VersionParseException: Could not parse created_by: parquet-mr version 1.6.0 using format: (.+) version ((.*) )?\(build ?(.*)\) at org.apache.parquet.VersionParser.parse(VersionParser.java:112) at org.apache.parquet.CorruptStatistics.shouldIgnoreStatistics(CorruptStatistics.java:60) at org.apache.parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics(ParquetMetadataConverter.java:263) at org.apache.parquet.hadoop.ParquetFileReader$Chunk.readAllPages(ParquetFileReader.java:583) at org.apache.parquet.hadoop.ParquetFileReader.readNextRowGroup(ParquetFileReader.java:513) at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.checkEndOfRowGroup(VectorizedParquetRecordReader.java:270) at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:225) at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:137) at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:102) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:162) at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:102) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source) at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:372) at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231) at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319) at org.apache.spark.rdd.RDD.iterator(RDD.scala:283) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) at org.apache.spark.scheduler.Task.run(Task.scala:99) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ``` This only happens during execution, not planning, and it doesn't matter what log level the `SparkContext` is set to. That's because Parquet (versions < 1.9) doesn't use slf4j for logging. Note, you can tell that log redirection is not working here because the log message format does not conform to the default Spark log message format. 
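The redirection this patch restores amounts to pointing Parquet's java.util.logging loggers at the SLF4J bridge. A condensed sketch of that pattern follows; the complete logic lives in the new `ParquetLogRedirector` class in the diff below, which additionally keeps strong references to the loggers so they cannot be garbage collected:

```
import java.util.logging.{Logger => JLogger}
import org.slf4j.bridge.SLF4JBridgeHandler

def redirect(logger: JLogger): Unit = {
  // Remove JUL's default console handler so Parquet stops writing directly to
  // stdout/stderr, then route its records through SLF4J (and hence log4j).
  logger.getHandlers.foreach(logger.removeHandler)
  logger.setUseParentHandlers(false)
  logger.addHandler(new SLF4JBridgeHandler)
}

// parquet-mr 1.6 and earlier log under "parquet"; 1.7/1.8 log under
// "org.apache.parquet".
redirect(JLogger.getLogger("parquet"))
redirect(JLogger.getLogger("org.apache.parquet"))
```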
This is a regression I noted as something we needed to fix as a follow up. It appears that the problem arose because we removed the call to `inferSchema` during Hive table conversion. That call is what triggered the output redirection. ## How was this patch tested? I tested this manually in four ways: 1. Executing `spark.sqlContext.range(10).selectExpr("id as a").write.mode("overwrite").parquet("test")`. 2. Executing `spark.read.format("parquet").load(legacyParquetFile).show` for a Parquet file `legacyParquetFile` written using Parquet-mr 1.6.0. 3. Executing `select * from legacy_parquet_table limit 1` for some unpartitioned Parquet-based Hive table written using Parquet-mr 1.6.0. 4. Executing `select * from legacy_partitioned_parquet_table where partcol=x limit 1` for some partitioned Parquet-based Hive table written using Parquet-mr 1.6.0. I ran each test with a new instance of `spark-shell` or `spark-sql`. Incidentally, I found that test case 3 was not a regression—redirection was not occurring in the master codebase prior to #14690. I spent some time working on a unit test, but based on my experience working on this ticket I feel that automated testing here is far from feasible. cc ericl dongjoon-hyun Author: Michael Allman Closes #15538 from mallman/spark-17993-fix_parquet_log_redirection. (cherry picked from commit b533fa2b205544b42dcebe0a6fee9d8275f6da7d) Signed-off-by: Reynold Xin --- .../parquet/ParquetLogRedirector.java | 72 +++++++++++++++++++ .../parquet/ParquetFileFormat.scala | 58 ++++----------- sql/core/src/test/resources/log4j.properties | 4 +- sql/hive/src/test/resources/log4j.properties | 4 ++ 4 files changed, 90 insertions(+), 48 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetLogRedirector.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetLogRedirector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetLogRedirector.java new file mode 100644 index 0000000000000..7a7f32ee1e87b --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetLogRedirector.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources.parquet; + +import java.io.Serializable; +import java.util.logging.Handler; +import java.util.logging.Logger; + +import org.apache.parquet.Log; +import org.slf4j.bridge.SLF4JBridgeHandler; + +// Redirects the JUL logging for parquet-mr versions <= 1.8 to SLF4J logging using +// SLF4JBridgeHandler. 
Parquet-mr versions >= 1.9 use SLF4J directly +final class ParquetLogRedirector implements Serializable { + // Client classes should hold a reference to INSTANCE to ensure redirection occurs. This is + // especially important for Serializable classes where fields are set but constructors are + // ignored + static final ParquetLogRedirector INSTANCE = new ParquetLogRedirector(); + + // JUL loggers must be held by a strong reference, otherwise they may get destroyed by GC. + // However, the root JUL logger used by Parquet isn't properly referenced. Here we keep + // references to loggers in both parquet-mr <= 1.6 and 1.7/1.8 + private static final Logger apacheParquetLogger = + Logger.getLogger(Log.class.getPackage().getName()); + private static final Logger parquetLogger = Logger.getLogger("parquet"); + + static { + // For parquet-mr 1.7 and 1.8, which are under `org.apache.parquet` namespace. + try { + Class.forName(Log.class.getName()); + redirect(Logger.getLogger(Log.class.getPackage().getName())); + } catch (ClassNotFoundException ex) { + throw new RuntimeException(ex); + } + + // For parquet-mr 1.6.0 and lower versions bundled with Hive, which are under `parquet` + // namespace. + try { + Class.forName("parquet.Log"); + redirect(Logger.getLogger("parquet")); + } catch (Throwable t) { + // SPARK-9974: com.twitter:parquet-hadoop-bundle:1.6.0 is not packaged into the assembly + // when Spark is built with SBT. So `parquet.Log` may not be found. This try/catch block + // should be removed after this issue is fixed. + } + } + + private ParquetLogRedirector() { + } + + private static void redirect(Logger logger) { + for (Handler handler : logger.getHandlers()) { + logger.removeHandler(handler); + } + logger.setUseParentHandlers(false); + logger.addHandler(new SLF4JBridgeHandler()); + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index b8ea7f40c4ab3..031a0fe57893f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.datasources.parquet import java.net.URI -import java.util.logging.{Logger => JLogger} import scala.collection.JavaConverters._ import scala.collection.mutable @@ -29,14 +28,12 @@ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.parquet.{Log => ApacheParquetLog} import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.filter2.predicate.FilterApi import org.apache.parquet.hadoop._ import org.apache.parquet.hadoop.codec.CodecConfig import org.apache.parquet.hadoop.util.ContextUtil import org.apache.parquet.schema.MessageType -import org.slf4j.bridge.SLF4JBridgeHandler import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging @@ -56,6 +53,11 @@ class ParquetFileFormat with DataSourceRegister with Logging with Serializable { + // Hold a reference to the (serializable) singleton instance of ParquetLogRedirector. This + // ensures the ParquetLogRedirector class is initialized whether an instance of ParquetFileFormat + // is constructed or deserialized. 
Do not heed the Scala compiler's warning about an unused field + // here. + private val parquetLogRedirector = ParquetLogRedirector.INSTANCE override def shortName(): String = "parquet" @@ -129,10 +131,14 @@ class ParquetFileFormat conf.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false) } - ParquetFileFormat.redirectParquetLogs() - new OutputWriterFactory { - override def newInstance( + // This OutputWriterFactory instance is deserialized when writing Parquet files on the + // executor side without constructing or deserializing ParquetFileFormat. Therefore, we hold + // another reference to ParquetLogRedirector.INSTANCE here to ensure the latter class is + // initialized. + private val parquetLogRedirector = ParquetLogRedirector.INSTANCE + + override def newInstance( path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { @@ -673,44 +679,4 @@ object ParquetFileFormat extends Logging { Failure(cause) }.toOption } - - // JUL loggers must be held by a strong reference, otherwise they may get destroyed by GC. - // However, the root JUL logger used by Parquet isn't properly referenced. Here we keep - // references to loggers in both parquet-mr <= 1.6 and >= 1.7 - val apacheParquetLogger: JLogger = JLogger.getLogger(classOf[ApacheParquetLog].getPackage.getName) - val parquetLogger: JLogger = JLogger.getLogger("parquet") - - // Parquet initializes its own JUL logger in a static block which always prints to stdout. Here - // we redirect the JUL logger via SLF4J JUL bridge handler. - val redirectParquetLogsViaSLF4J: Unit = { - def redirect(logger: JLogger): Unit = { - logger.getHandlers.foreach(logger.removeHandler) - logger.setUseParentHandlers(false) - logger.addHandler(new SLF4JBridgeHandler) - } - - // For parquet-mr 1.7.0 and above versions, which are under `org.apache.parquet` namespace. - // scalastyle:off classforname - Class.forName(classOf[ApacheParquetLog].getName) - // scalastyle:on classforname - redirect(JLogger.getLogger(classOf[ApacheParquetLog].getPackage.getName)) - - // For parquet-mr 1.6.0 and lower versions bundled with Hive, which are under `parquet` - // namespace. - try { - // scalastyle:off classforname - Class.forName("parquet.Log") - // scalastyle:on classforname - redirect(JLogger.getLogger("parquet")) - } catch { case _: Throwable => - // SPARK-9974: com.twitter:parquet-hadoop-bundle:1.6.0 is not packaged into the assembly - // when Spark is built with SBT. So `parquet.Log` may not be found. This try/catch block - // should be removed after this issue is fixed. - } - } - - /** - * ParquetFileFormat.prepareWrite calls this function to initialize `redirectParquetLogsViaSLF4J`. 
- */ - def redirectParquetLogs(): Unit = {} } diff --git a/sql/core/src/test/resources/log4j.properties b/sql/core/src/test/resources/log4j.properties index 33b9ecf1e2826..25b817382195a 100644 --- a/sql/core/src/test/resources/log4j.properties +++ b/sql/core/src/test/resources/log4j.properties @@ -53,5 +53,5 @@ log4j.additivity.hive.ql.metadata.Hive=false log4j.logger.hive.ql.metadata.Hive=OFF # Parquet related logging -log4j.logger.org.apache.parquet.hadoop=WARN -log4j.logger.org.apache.spark.sql.parquet=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR diff --git a/sql/hive/src/test/resources/log4j.properties b/sql/hive/src/test/resources/log4j.properties index fea3404769d9d..072bb25d30a87 100644 --- a/sql/hive/src/test/resources/log4j.properties +++ b/sql/hive/src/test/resources/log4j.properties @@ -59,3 +59,7 @@ log4j.logger.hive.ql.metadata.Hive=OFF log4j.additivity.org.apache.hadoop.hive.ql.io.RCFile=false log4j.logger.org.apache.hadoop.hive.ql.io.RCFile=ERROR + +# Parquet related logging +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR From c602894f25bf9e61b759815674008471858cc71e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 10 Nov 2016 13:42:48 -0800 Subject: [PATCH 085/534] [SPARK-17990][SPARK-18302][SQL] correct several partition related behaviours of ExternalCatalog ## What changes were proposed in this pull request? This PR corrects several partition related behaviors of `ExternalCatalog`: 1. default partition location should not always lower case the partition column names in path string(fix `HiveExternalCatalog`) 2. rename partition should not always lower case the partition column names in updated partition path string(fix `HiveExternalCatalog`) 3. rename partition should update the partition location only for managed table(fix `InMemoryCatalog`) 4. create partition with existing directory should be fine(fix `InMemoryCatalog`) 5. create partition with non-existing directory should create that directory(fix `InMemoryCatalog`) 6. drop partition from external table should not delete the directory(fix `InMemoryCatalog`) ## How was this patch tested? new tests in `ExternalCatalogSuite` Author: Wenchen Fan Closes #15797 from cloud-fan/partition. 
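To make the default partition location behaviour (item 1 above) concrete, here is a hedged usage sketch of the `ExternalCatalogUtils` helper introduced in the diff below; the table location and partition values are made up, but the expected layout matches the new `ExternalCatalogSuite` assertions:

```
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils

val tablePath = new Path("/warehouse/db1.db/tbl")        // hypothetical table location
val spec = Map("partCol1" -> "1", "partCol2" -> "2")     // mixed-case column names

// Column names keep their case and values are escaped; a null value becomes
// the Hive default partition name.
val partPath = ExternalCatalogUtils.generatePartitionPath(
  spec, Seq("partCol1", "partCol2"), tablePath)
// partPath: /warehouse/db1.db/tbl/partCol1=1/partCol2=2

ExternalCatalogUtils.escapePathName("2016/11/10")        // "2016%2F11%2F10"
ExternalCatalogUtils.DEFAULT_PARTITION_NAME              // "__HIVE_DEFAULT_PARTITION__"
```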
(cherry picked from commit 2f7461f31331cfc37f6cfa3586b7bbefb3af5547) Signed-off-by: Reynold Xin --- .../catalog/ExternalCatalogUtils.scala | 121 ++++++++++++++ .../catalyst/catalog/InMemoryCatalog.scala | 92 +++++------ .../sql/catalyst/catalog/interface.scala | 11 ++ .../catalog/ExternalCatalogSuite.scala | 150 ++++++++++++++---- .../catalog/SessionCatalogSuite.scala | 24 ++- .../spark/sql/execution/command/ddl.scala | 8 +- .../spark/sql/execution/command/tables.scala | 3 +- .../datasources/CatalogFileIndex.scala | 2 +- .../datasources/DataSourceStrategy.scala | 2 +- .../datasources/FileFormatWriter.scala | 6 +- .../PartitioningAwareFileIndex.scala | 2 - .../datasources/PartitioningUtils.scala | 94 +---------- .../sql/execution/command/DDLSuite.scala | 8 +- .../ParquetPartitionDiscoverySuite.scala | 21 +-- .../spark/sql/hive/HiveExternalCatalog.scala | 51 +++++- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 4 +- .../spark/sql/hive/MultiDatabaseSuite.scala | 2 +- .../sql/hive/execution/HiveDDLSuite.scala | 2 +- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- 19 files changed, 397 insertions(+), 208 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala new file mode 100644 index 0000000000000..b1442eec164d8 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.catalog + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.util.Shell + +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec + +object ExternalCatalogUtils { + // This duplicates default value of Hive `ConfVars.DEFAULTPARTITIONNAME`, since catalyst doesn't + // depend on Hive. + val DEFAULT_PARTITION_NAME = "__HIVE_DEFAULT_PARTITION__" + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // The following string escaping code is mainly copied from Hive (o.a.h.h.common.FileUtils). + ////////////////////////////////////////////////////////////////////////////////////////////////// + + val charToEscape = { + val bitSet = new java.util.BitSet(128) + + /** + * ASCII 01-1F are HTTP control characters that need to be escaped. + * \u000A and \u000D are \n and \r, respectively. 
+ */ + val clist = Array( + '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u0009', + '\n', '\u000B', '\u000C', '\r', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013', + '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C', + '\u001D', '\u001E', '\u001F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F', + '{', '[', ']', '^') + + clist.foreach(bitSet.set(_)) + + if (Shell.WINDOWS) { + Array(' ', '<', '>', '|').foreach(bitSet.set(_)) + } + + bitSet + } + + def needsEscaping(c: Char): Boolean = { + c >= 0 && c < charToEscape.size() && charToEscape.get(c) + } + + def escapePathName(path: String): String = { + val builder = new StringBuilder() + path.foreach { c => + if (needsEscaping(c)) { + builder.append('%') + builder.append(f"${c.asInstanceOf[Int]}%02X") + } else { + builder.append(c) + } + } + + builder.toString() + } + + + def unescapePathName(path: String): String = { + val sb = new StringBuilder + var i = 0 + + while (i < path.length) { + val c = path.charAt(i) + if (c == '%' && i + 2 < path.length) { + val code: Int = try { + Integer.parseInt(path.substring(i + 1, i + 3), 16) + } catch { + case _: Exception => -1 + } + if (code >= 0) { + sb.append(code.asInstanceOf[Char]) + i += 3 + } else { + sb.append(c) + i += 1 + } + } else { + sb.append(c) + i += 1 + } + } + + sb.toString() + } + + def generatePartitionPath( + spec: TablePartitionSpec, + partitionColumnNames: Seq[String], + tablePath: Path): Path = { + val partitionPathStrings = partitionColumnNames.map { col => + val partitionValue = spec(col) + val partitionString = if (partitionValue == null) { + DEFAULT_PARTITION_NAME + } else { + escapePathName(partitionValue) + } + escapePathName(col) + "=" + partitionString + } + partitionPathStrings.foldLeft(tablePath) { (totalPath, nextPartPath) => + new Path(totalPath, nextPartPath) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 20db81e6f9060..a3ffeaa63f690 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -231,7 +231,7 @@ class InMemoryCatalog( assert(tableMeta.storage.locationUri.isDefined, "Managed table should always have table location, as we will assign a default location " + "to it if it doesn't have one.") - val dir = new Path(tableMeta.storage.locationUri.get) + val dir = new Path(tableMeta.location) try { val fs = dir.getFileSystem(hadoopConfig) fs.delete(dir, true) @@ -259,7 +259,7 @@ class InMemoryCatalog( assert(oldDesc.table.storage.locationUri.isDefined, "Managed table should always have table location, as we will assign a default location " + "to it if it doesn't have one.") - val oldDir = new Path(oldDesc.table.storage.locationUri.get) + val oldDir = new Path(oldDesc.table.location) val newDir = new Path(catalog(db).db.locationUri, newName) try { val fs = oldDir.getFileSystem(hadoopConfig) @@ -355,25 +355,28 @@ class InMemoryCatalog( } } - val tableDir = new Path(catalog(db).db.locationUri, table) - val partitionColumnNames = getTable(db, table).partitionColumnNames + val tableMeta = getTable(db, table) + val partitionColumnNames = tableMeta.partitionColumnNames + val tablePath = new Path(tableMeta.location) // TODO: we should follow hive to roll back if one partition path 
failed to create. parts.foreach { p => - // If location is set, the partition is using an external partition location and we don't - // need to handle its directory. - if (p.storage.locationUri.isEmpty) { - val partitionPath = partitionColumnNames.flatMap { col => - p.spec.get(col).map(col + "=" + _) - }.mkString("/") - try { - val fs = tableDir.getFileSystem(hadoopConfig) - fs.mkdirs(new Path(tableDir, partitionPath)) - } catch { - case e: IOException => - throw new SparkException(s"Unable to create partition path $partitionPath", e) + val partitionPath = p.storage.locationUri.map(new Path(_)).getOrElse { + ExternalCatalogUtils.generatePartitionPath(p.spec, partitionColumnNames, tablePath) + } + + try { + val fs = tablePath.getFileSystem(hadoopConfig) + if (!fs.exists(partitionPath)) { + fs.mkdirs(partitionPath) } + } catch { + case e: IOException => + throw new SparkException(s"Unable to create partition path $partitionPath", e) } - existingParts.put(p.spec, p) + + existingParts.put( + p.spec, + p.copy(storage = p.storage.copy(locationUri = Some(partitionPath.toString)))) } } @@ -392,19 +395,15 @@ class InMemoryCatalog( } } - val tableDir = new Path(catalog(db).db.locationUri, table) - val partitionColumnNames = getTable(db, table).partitionColumnNames - // TODO: we should follow hive to roll back if one partition path failed to delete. + val shouldRemovePartitionLocation = getTable(db, table).tableType == CatalogTableType.MANAGED + // TODO: we should follow hive to roll back if one partition path failed to delete, and support + // partial partition spec. partSpecs.foreach { p => - // If location is set, the partition is using an external partition location and we don't - // need to handle its directory. - if (existingParts.contains(p) && existingParts(p).storage.locationUri.isEmpty) { - val partitionPath = partitionColumnNames.flatMap { col => - p.get(col).map(col + "=" + _) - }.mkString("/") + if (existingParts.contains(p) && shouldRemovePartitionLocation) { + val partitionPath = new Path(existingParts(p).location) try { - val fs = tableDir.getFileSystem(hadoopConfig) - fs.delete(new Path(tableDir, partitionPath), true) + val fs = partitionPath.getFileSystem(hadoopConfig) + fs.delete(partitionPath, true) } catch { case e: IOException => throw new SparkException(s"Unable to delete partition path $partitionPath", e) @@ -423,33 +422,34 @@ class InMemoryCatalog( requirePartitionsExist(db, table, specs) requirePartitionsNotExist(db, table, newSpecs) - val tableDir = new Path(catalog(db).db.locationUri, table) - val partitionColumnNames = getTable(db, table).partitionColumnNames + val tableMeta = getTable(db, table) + val partitionColumnNames = tableMeta.partitionColumnNames + val tablePath = new Path(tableMeta.location) + val shouldUpdatePartitionLocation = getTable(db, table).tableType == CatalogTableType.MANAGED + val existingParts = catalog(db).tables(table).partitions // TODO: we should follow hive to roll back if one partition path failed to rename. specs.zip(newSpecs).foreach { case (oldSpec, newSpec) => - val newPart = getPartition(db, table, oldSpec).copy(spec = newSpec) - val existingParts = catalog(db).tables(table).partitions - - // If location is set, the partition is using an external partition location and we don't - // need to handle its directory. 
- if (newPart.storage.locationUri.isEmpty) { - val oldPath = partitionColumnNames.flatMap { col => - oldSpec.get(col).map(col + "=" + _) - }.mkString("/") - val newPath = partitionColumnNames.flatMap { col => - newSpec.get(col).map(col + "=" + _) - }.mkString("/") + val oldPartition = getPartition(db, table, oldSpec) + val newPartition = if (shouldUpdatePartitionLocation) { + val oldPartPath = new Path(oldPartition.location) + val newPartPath = ExternalCatalogUtils.generatePartitionPath( + newSpec, partitionColumnNames, tablePath) try { - val fs = tableDir.getFileSystem(hadoopConfig) - fs.rename(new Path(tableDir, oldPath), new Path(tableDir, newPath)) + val fs = tablePath.getFileSystem(hadoopConfig) + fs.rename(oldPartPath, newPartPath) } catch { case e: IOException => - throw new SparkException(s"Unable to rename partition path $oldPath", e) + throw new SparkException(s"Unable to rename partition path $oldPartPath", e) } + oldPartition.copy( + spec = newSpec, + storage = oldPartition.storage.copy(locationUri = Some(newPartPath.toString))) + } else { + oldPartition.copy(spec = newSpec) } existingParts.remove(oldSpec) - existingParts.put(newSpec, newPart) + existingParts.put(newSpec, newPartition) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 34748a04859ad..93c70de18ae7e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -99,6 +99,12 @@ case class CatalogTablePartition( output.filter(_.nonEmpty).mkString("CatalogPartition(\n\t", "\n\t", ")") } + /** Return the partition location, assuming it is specified. */ + def location: String = storage.locationUri.getOrElse { + val specString = spec.map { case (k, v) => s"$k=$v" }.mkString(", ") + throw new AnalysisException(s"Partition [$specString] did not specify locationUri") + } + /** * Given the partition schema, returns a row with that schema holding the partition values. */ @@ -171,6 +177,11 @@ case class CatalogTable( throw new AnalysisException(s"table $identifier did not specify database") } + /** Return the table location, assuming it is specified. */ + def location: String = storage.locationUri.getOrElse { + throw new AnalysisException(s"table $identifier did not specify locationUri") + } + /** Return the fully qualified name of this table, assuming the database was specified. 
*/ def qualifiedName: String = identifier.unquotedString diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index 34bdfc8a98710..303a8662d3f4d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -17,9 +17,8 @@ package org.apache.spark.sql.catalyst.catalog -import java.io.File -import java.net.URI - +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite @@ -320,6 +319,33 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac catalog.createPartitions("db2", "tbl2", Seq(part1), ignoreIfExists = true) } + test("create partitions without location") { + val catalog = newBasicCatalog() + val table = CatalogTable( + identifier = TableIdentifier("tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = CatalogStorageFormat(None, None, None, None, false, Map.empty), + schema = new StructType() + .add("col1", "int") + .add("col2", "string") + .add("partCol1", "int") + .add("partCol2", "string"), + provider = Some("hive"), + partitionColumnNames = Seq("partCol1", "partCol2")) + catalog.createTable(table, ignoreIfExists = false) + + val partition = CatalogTablePartition(Map("partCol1" -> "1", "partCol2" -> "2"), storageFormat) + catalog.createPartitions("db1", "tbl", Seq(partition), ignoreIfExists = false) + + val partitionLocation = catalog.getPartition( + "db1", + "tbl", + Map("partCol1" -> "1", "partCol2" -> "2")).location + val tableLocation = catalog.getTable("db1", "tbl").location + val defaultPartitionLocation = new Path(new Path(tableLocation, "partCol1=1"), "partCol2=2") + assert(new Path(partitionLocation) == defaultPartitionLocation) + } + test("list partitions with partial partition spec") { val catalog = newBasicCatalog() val parts = catalog.listPartitions("db2", "tbl2", Some(Map("a" -> "1"))) @@ -399,6 +425,46 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac intercept[AnalysisException] { catalog.getPartition("db2", "tbl2", part2.spec) } } + test("rename partitions should update the location for managed table") { + val catalog = newBasicCatalog() + val table = CatalogTable( + identifier = TableIdentifier("tbl", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = CatalogStorageFormat(None, None, None, None, false, Map.empty), + schema = new StructType() + .add("col1", "int") + .add("col2", "string") + .add("partCol1", "int") + .add("partCol2", "string"), + provider = Some("hive"), + partitionColumnNames = Seq("partCol1", "partCol2")) + catalog.createTable(table, ignoreIfExists = false) + + val tableLocation = catalog.getTable("db1", "tbl").location + + val mixedCasePart1 = CatalogTablePartition( + Map("partCol1" -> "1", "partCol2" -> "2"), storageFormat) + val mixedCasePart2 = CatalogTablePartition( + Map("partCol1" -> "3", "partCol2" -> "4"), storageFormat) + + catalog.createPartitions("db1", "tbl", Seq(mixedCasePart1), ignoreIfExists = false) + assert( + new Path(catalog.getPartition("db1", "tbl", mixedCasePart1.spec).location) == + new Path(new Path(tableLocation, "partCol1=1"), "partCol2=2")) + + catalog.renamePartitions("db1", "tbl", Seq(mixedCasePart1.spec), Seq(mixedCasePart2.spec)) + 
assert( + new Path(catalog.getPartition("db1", "tbl", mixedCasePart2.spec).location) == + new Path(new Path(tableLocation, "partCol1=3"), "partCol2=4")) + + // For external tables, RENAME PARTITION should not update the partition location. + val existingPartLoc = catalog.getPartition("db2", "tbl2", part1.spec).location + catalog.renamePartitions("db2", "tbl2", Seq(part1.spec), Seq(part3.spec)) + assert( + new Path(catalog.getPartition("db2", "tbl2", part3.spec).location) == + new Path(existingPartLoc)) + } + test("rename partitions when database/table does not exist") { val catalog = newBasicCatalog() intercept[AnalysisException] { @@ -419,11 +485,6 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac test("alter partitions") { val catalog = newBasicCatalog() try { - // Note: Before altering table partitions in Hive, you *must* set the current database - // to the one that contains the table of interest. Otherwise you will end up with the - // most helpful error message ever: "Unable to alter partition. alter is not possible." - // See HIVE-2742 for more detail. - catalog.setCurrentDatabase("db2") val newLocation = newUriForDatabase() val newSerde = "com.sparkbricks.text.EasySerde" val newSerdeProps = Map("spark" -> "bricks", "compressed" -> "false") @@ -571,10 +632,11 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac // -------------------------------------------------------------------------- private def exists(uri: String, children: String*): Boolean = { - val base = new File(new URI(uri)) - children.foldLeft(base) { - case (parent, child) => new File(parent, child) - }.exists() + val base = new Path(uri) + val finalPath = children.foldLeft(base) { + case (parent, child) => new Path(parent, child) + } + base.getFileSystem(new Configuration()).exists(finalPath) } test("create/drop database should create/delete the directory") { @@ -623,7 +685,6 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac test("create/drop/rename partitions should create/delete/rename the directory") { val catalog = newBasicCatalog() - val databaseDir = catalog.getDatabase("db1").locationUri val table = CatalogTable( identifier = TableIdentifier("tbl", Some("db1")), tableType = CatalogTableType.MANAGED, @@ -631,34 +692,61 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac schema = new StructType() .add("col1", "int") .add("col2", "string") - .add("a", "int") - .add("b", "string"), + .add("partCol1", "int") + .add("partCol2", "string"), provider = Some("hive"), - partitionColumnNames = Seq("a", "b") - ) + partitionColumnNames = Seq("partCol1", "partCol2")) catalog.createTable(table, ignoreIfExists = false) + val tableLocation = catalog.getTable("db1", "tbl").location + + val part1 = CatalogTablePartition(Map("partCol1" -> "1", "partCol2" -> "2"), storageFormat) + val part2 = CatalogTablePartition(Map("partCol1" -> "3", "partCol2" -> "4"), storageFormat) + val part3 = CatalogTablePartition(Map("partCol1" -> "5", "partCol2" -> "6"), storageFormat) + catalog.createPartitions("db1", "tbl", Seq(part1, part2), ignoreIfExists = false) - assert(exists(databaseDir, "tbl", "a=1", "b=2")) - assert(exists(databaseDir, "tbl", "a=3", "b=4")) + assert(exists(tableLocation, "partCol1=1", "partCol2=2")) + assert(exists(tableLocation, "partCol1=3", "partCol2=4")) catalog.renamePartitions("db1", "tbl", Seq(part1.spec), Seq(part3.spec)) - assert(!exists(databaseDir, "tbl", "a=1", "b=2")) - 
assert(exists(databaseDir, "tbl", "a=5", "b=6")) + assert(!exists(tableLocation, "partCol1=1", "partCol2=2")) + assert(exists(tableLocation, "partCol1=5", "partCol2=6")) catalog.dropPartitions("db1", "tbl", Seq(part2.spec, part3.spec), ignoreIfNotExists = false, purge = false) - assert(!exists(databaseDir, "tbl", "a=3", "b=4")) - assert(!exists(databaseDir, "tbl", "a=5", "b=6")) + assert(!exists(tableLocation, "partCol1=3", "partCol2=4")) + assert(!exists(tableLocation, "partCol1=5", "partCol2=6")) - val externalPartition = CatalogTablePartition( - Map("a" -> "7", "b" -> "8"), + val tempPath = Utils.createTempDir() + // create partition with existing directory is OK. + val partWithExistingDir = CatalogTablePartition( + Map("partCol1" -> "7", "partCol2" -> "8"), CatalogStorageFormat( - Some(Utils.createTempDir().getAbsolutePath), - None, None, None, false, Map.empty) - ) - catalog.createPartitions("db1", "tbl", Seq(externalPartition), ignoreIfExists = false) - assert(!exists(databaseDir, "tbl", "a=7", "b=8")) + Some(tempPath.getAbsolutePath), + None, None, None, false, Map.empty)) + catalog.createPartitions("db1", "tbl", Seq(partWithExistingDir), ignoreIfExists = false) + + tempPath.delete() + // create partition with non-existing directory will create that directory. + val partWithNonExistingDir = CatalogTablePartition( + Map("partCol1" -> "9", "partCol2" -> "10"), + CatalogStorageFormat( + Some(tempPath.getAbsolutePath), + None, None, None, false, Map.empty)) + catalog.createPartitions("db1", "tbl", Seq(partWithNonExistingDir), ignoreIfExists = false) + assert(tempPath.exists()) + } + + test("drop partition from external table should not delete the directory") { + val catalog = newBasicCatalog() + catalog.createPartitions("db2", "tbl1", Seq(part1), ignoreIfExists = false) + + val partPath = new Path(catalog.getPartition("db2", "tbl1", part1.spec).location) + val fs = partPath.getFileSystem(new Configuration) + assert(fs.exists(partPath)) + + catalog.dropPartitions("db2", "tbl1", Seq(part1.spec), ignoreIfNotExists = false, purge = false) + assert(fs.exists(partPath)) } } @@ -731,7 +819,7 @@ abstract class CatalogTestUtils { CatalogTable( identifier = TableIdentifier(name, database), tableType = CatalogTableType.EXTERNAL, - storage = storageFormat, + storage = storageFormat.copy(locationUri = Some(Utils.createTempDir().getAbsolutePath)), schema = new StructType() .add("col1", "int") .add("col2", "string") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 001d9c47785d2..52385de50db6b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -527,13 +527,13 @@ class SessionCatalogSuite extends SparkFunSuite { sessionCatalog.createTable(newTable("tbl", "mydb"), ignoreIfExists = false) sessionCatalog.createPartitions( TableIdentifier("tbl", Some("mydb")), Seq(part1, part2), ignoreIfExists = false) - assert(catalogPartitionsEqual(externalCatalog, "mydb", "tbl", Seq(part1, part2))) + assert(catalogPartitionsEqual(externalCatalog.listPartitions("mydb", "tbl"), part1, part2)) // Create partitions without explicitly specifying database sessionCatalog.setCurrentDatabase("mydb") sessionCatalog.createPartitions( TableIdentifier("tbl"), Seq(partWithMixedOrder), ignoreIfExists = false) 
assert(catalogPartitionsEqual( - externalCatalog, "mydb", "tbl", Seq(part1, part2, partWithMixedOrder))) + externalCatalog.listPartitions("mydb", "tbl"), part1, part2, partWithMixedOrder)) } test("create partitions when database/table does not exist") { @@ -586,13 +586,13 @@ class SessionCatalogSuite extends SparkFunSuite { test("drop partitions") { val externalCatalog = newBasicCatalog() val sessionCatalog = new SessionCatalog(externalCatalog) - assert(catalogPartitionsEqual(externalCatalog, "db2", "tbl2", Seq(part1, part2))) + assert(catalogPartitionsEqual(externalCatalog.listPartitions("db2", "tbl2"), part1, part2)) sessionCatalog.dropPartitions( TableIdentifier("tbl2", Some("db2")), Seq(part1.spec), ignoreIfNotExists = false, purge = false) - assert(catalogPartitionsEqual(externalCatalog, "db2", "tbl2", Seq(part2))) + assert(catalogPartitionsEqual(externalCatalog.listPartitions("db2", "tbl2"), part2)) // Drop partitions without explicitly specifying database sessionCatalog.setCurrentDatabase("db2") sessionCatalog.dropPartitions( @@ -604,7 +604,7 @@ class SessionCatalogSuite extends SparkFunSuite { // Drop multiple partitions at once sessionCatalog.createPartitions( TableIdentifier("tbl2", Some("db2")), Seq(part1, part2), ignoreIfExists = false) - assert(catalogPartitionsEqual(externalCatalog, "db2", "tbl2", Seq(part1, part2))) + assert(catalogPartitionsEqual(externalCatalog.listPartitions("db2", "tbl2"), part1, part2)) sessionCatalog.dropPartitions( TableIdentifier("tbl2", Some("db2")), Seq(part1.spec, part2.spec), @@ -844,10 +844,11 @@ class SessionCatalogSuite extends SparkFunSuite { test("list partitions") { val catalog = new SessionCatalog(newBasicCatalog()) - assert(catalog.listPartitions(TableIdentifier("tbl2", Some("db2"))).toSet == Set(part1, part2)) + assert(catalogPartitionsEqual( + catalog.listPartitions(TableIdentifier("tbl2", Some("db2"))), part1, part2)) // List partitions without explicitly specifying database catalog.setCurrentDatabase("db2") - assert(catalog.listPartitions(TableIdentifier("tbl2")).toSet == Set(part1, part2)) + assert(catalogPartitionsEqual(catalog.listPartitions(TableIdentifier("tbl2")), part1, part2)) } test("list partitions when database/table does not exist") { @@ -860,6 +861,15 @@ class SessionCatalogSuite extends SparkFunSuite { } } + private def catalogPartitionsEqual( + actualParts: Seq[CatalogTablePartition], + expectedParts: CatalogTablePartition*): Boolean = { + // ExternalCatalog may set a default location for partitions, here we ignore the partition + // location when comparing them. 
+ actualParts.map(p => p.copy(storage = p.storage.copy(locationUri = None))).toSet == + expectedParts.map(p => p.copy(storage = p.storage.copy(locationUri = None))).toSet + } + // -------------------------------------------------------------------------- // Functions // -------------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 8500ab460a1b6..84a63fdb9f36f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -29,7 +29,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.Resolver -import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog} +import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, PartitioningUtils} @@ -500,7 +500,7 @@ case class AlterTableRecoverPartitionsCommand( s"location provided: $tableIdentWithDB") } - val root = new Path(table.storage.locationUri.get) + val root = new Path(table.location) logInfo(s"Recover all the partitions in $root") val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) @@ -558,9 +558,9 @@ case class AlterTableRecoverPartitionsCommand( val name = st.getPath.getName if (st.isDirectory && name.contains("=")) { val ps = name.split("=", 2) - val columnName = PartitioningUtils.unescapePathName(ps(0)) + val columnName = ExternalCatalogUtils.unescapePathName(ps(0)) // TODO: Validate the value - val value = PartitioningUtils.unescapePathName(ps(1)) + val value = ExternalCatalogUtils.unescapePathName(ps(1)) if (resolver(columnName, partitionNames.head)) { scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(partitionNames.head -> value), partitionNames.drop(1), threshold, resolver) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index e49a1f5acd0c9..119e732d0202c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -710,7 +710,8 @@ case class ShowPartitionsCommand( private def getPartName(spec: TablePartitionSpec, partColNames: Seq[String]): String = { partColNames.map { name => - PartitioningUtils.escapePathName(name) + "=" + PartitioningUtils.escapePathName(spec(name)) + ExternalCatalogUtils.escapePathName(name) + "=" + + ExternalCatalogUtils.escapePathName(spec(name)) }.mkString(File.separator) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala index 443a2ec033a98..4ad91dcceb432 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala @@ -67,7 +67,7 @@ class 
CatalogFileIndex( val selectedPartitions = sparkSession.sessionState.catalog.listPartitionsByFilter( table.identifier, filters) val partitions = selectedPartitions.map { p => - val path = new Path(p.storage.locationUri.get) + val path = new Path(p.location) val fs = path.getFileSystem(hadoopConf) PartitionPath( p.toRow(partitionSchema), path.makeQualified(fs.getUri, fs.getWorkingDirectory)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2d43a6ad098ed..739aeac877b99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -190,7 +190,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { val effectiveOutputPath = if (overwritingSinglePartition) { val partition = t.sparkSession.sessionState.catalog.getPartition( l.catalogTable.get.identifier, overwrite.specificPartition.get) - new Path(partition.storage.locationUri.get) + new Path(partition.location) } else { outputPath } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index e404dcd5452b9..0f8ed9e23fe3b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -32,7 +32,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.sql.{Dataset, SparkSession} -import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, ExternalCatalogUtils} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning @@ -281,11 +281,11 @@ object FileFormatWriter extends Logging { private def partitionStringExpression: Seq[Expression] = { description.partitionColumns.zipWithIndex.flatMap { case (c, i) => val escaped = ScalaUDF( - PartitioningUtils.escapePathName _, + ExternalCatalogUtils.escapePathName _, StringType, Seq(Cast(c, StringType)), Seq(StringType)) - val str = If(IsNull(c), Literal(PartitioningUtils.DEFAULT_PARTITION_NAME), escaped) + val str = If(IsNull(c), Literal(ExternalCatalogUtils.DEFAULT_PARTITION_NAME), escaped) val partitionName = Literal(c.name + "=") :: str :: Nil if (i == 0) partitionName else Literal(Path.SEPARATOR) :: partitionName } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index a8a722dd3c620..3740caa22c37e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -128,7 +128,6 @@ abstract class PartitioningAwareFileIndex( case Some(userProvidedSchema) if userProvidedSchema.nonEmpty => val spec = PartitioningUtils.parsePartitions( leafDirs, - 
PartitioningUtils.DEFAULT_PARTITION_NAME, typeInference = false, basePaths = basePaths) @@ -148,7 +147,6 @@ abstract class PartitioningAwareFileIndex( case _ => PartitioningUtils.parsePartitions( leafDirs, - PartitioningUtils.DEFAULT_PARTITION_NAME, typeInference = sparkSession.sessionState.conf.partitionColumnTypeInferenceEnabled, basePaths = basePaths) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index b51b41869bf06..a28b04ca3fb5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -25,7 +25,6 @@ import scala.collection.mutable.ArrayBuffer import scala.util.Try import org.apache.hadoop.fs.Path -import org.apache.hadoop.util.Shell import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow @@ -56,15 +55,15 @@ object PartitionSpec { } object PartitioningUtils { - // This duplicates default value of Hive `ConfVars.DEFAULTPARTITIONNAME`, since sql/core doesn't - // depend on Hive. - val DEFAULT_PARTITION_NAME = "__HIVE_DEFAULT_PARTITION__" private[datasources] case class PartitionValues(columnNames: Seq[String], literals: Seq[Literal]) { require(columnNames.size == literals.size) } + import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.DEFAULT_PARTITION_NAME + import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.unescapePathName + /** * Given a group of qualified paths, tries to parse them and returns a partition specification. * For example, given: @@ -90,12 +89,11 @@ object PartitioningUtils { */ private[datasources] def parsePartitions( paths: Seq[Path], - defaultPartitionName: String, typeInference: Boolean, basePaths: Set[Path]): PartitionSpec = { // First, we need to parse every partition's path and see if we can find partition values. val (partitionValues, optDiscoveredBasePaths) = paths.map { path => - parsePartition(path, defaultPartitionName, typeInference, basePaths) + parsePartition(path, typeInference, basePaths) }.unzip // We create pairs of (path -> path's partition value) here @@ -173,7 +171,6 @@ object PartitioningUtils { */ private[datasources] def parsePartition( path: Path, - defaultPartitionName: String, typeInference: Boolean, basePaths: Set[Path]): (Option[PartitionValues], Option[Path]) = { val columns = ArrayBuffer.empty[(String, Literal)] @@ -196,7 +193,7 @@ object PartitioningUtils { // Let's say currentPath is a path of "/table/a=1/", currentPath.getName will give us a=1. // Once we get the string, we try to parse it and find the partition column and value. val maybeColumn = - parsePartitionColumn(currentPath.getName, defaultPartitionName, typeInference) + parsePartitionColumn(currentPath.getName, typeInference) maybeColumn.foreach(columns += _) // Now, we determine if we should stop. 
@@ -228,7 +225,6 @@ object PartitioningUtils { private def parsePartitionColumn( columnSpec: String, - defaultPartitionName: String, typeInference: Boolean): Option[(String, Literal)] = { val equalSignIndex = columnSpec.indexOf('=') if (equalSignIndex == -1) { @@ -240,7 +236,7 @@ object PartitioningUtils { val rawColumnValue = columnSpec.drop(equalSignIndex + 1) assert(rawColumnValue.nonEmpty, s"Empty partition column value in '$columnSpec'") - val literal = inferPartitionColumnValue(rawColumnValue, defaultPartitionName, typeInference) + val literal = inferPartitionColumnValue(rawColumnValue, typeInference) Some(columnName -> literal) } } @@ -355,7 +351,6 @@ object PartitioningUtils { */ private[datasources] def inferPartitionColumnValue( raw: String, - defaultPartitionName: String, typeInference: Boolean): Literal = { val decimalTry = Try { // `BigDecimal` conversion can fail when the `field` is not a form of number. @@ -380,14 +375,14 @@ object PartitioningUtils { .orElse(Try(Literal(JTimestamp.valueOf(unescapePathName(raw))))) // Then falls back to string .getOrElse { - if (raw == defaultPartitionName) { + if (raw == DEFAULT_PARTITION_NAME) { Literal.create(null, NullType) } else { Literal.create(unescapePathName(raw), StringType) } } } else { - if (raw == defaultPartitionName) { + if (raw == DEFAULT_PARTITION_NAME) { Literal.create(null, NullType) } else { Literal.create(unescapePathName(raw), StringType) @@ -450,77 +445,4 @@ object PartitioningUtils { Literal.create(Cast(l, desiredType).eval(), desiredType) } } - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // The following string escaping code is mainly copied from Hive (o.a.h.h.common.FileUtils). - ////////////////////////////////////////////////////////////////////////////////////////////////// - - val charToEscape = { - val bitSet = new java.util.BitSet(128) - - /** - * ASCII 01-1F are HTTP control characters that need to be escaped. - * \u000A and \u000D are \n and \r, respectively. 
- */ - val clist = Array( - '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u0009', - '\n', '\u000B', '\u000C', '\r', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013', - '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C', - '\u001D', '\u001E', '\u001F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '\u007F', - '{', '[', ']', '^') - - clist.foreach(bitSet.set(_)) - - if (Shell.WINDOWS) { - Array(' ', '<', '>', '|').foreach(bitSet.set(_)) - } - - bitSet - } - - def needsEscaping(c: Char): Boolean = { - c >= 0 && c < charToEscape.size() && charToEscape.get(c) - } - - def escapePathName(path: String): String = { - val builder = new StringBuilder() - path.foreach { c => - if (needsEscaping(c)) { - builder.append('%') - builder.append(f"${c.asInstanceOf[Int]}%02X") - } else { - builder.append(c) - } - } - - builder.toString() - } - - def unescapePathName(path: String): String = { - val sb = new StringBuilder - var i = 0 - - while (i < path.length) { - val c = path.charAt(i) - if (c == '%' && i + 2 < path.length) { - val code: Int = try { - Integer.parseInt(path.substring(i + 1, i + 3), 16) - } catch { - case _: Exception => -1 - } - if (code >= 0) { - sb.append(code.asInstanceOf[Char]) - i += 3 - } else { - sb.append(c) - i += 1 - } - } else { - sb.append(c) - i += 1 - } - } - - sb.toString() - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index df3a3c34c39a0..363715c6d2249 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -875,7 +875,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1)) val part2 = Map("a" -> "2", "b" -> "6") - val root = new Path(catalog.getTableMetadata(tableIdent).storage.locationUri.get) + val root = new Path(catalog.getTableMetadata(tableIdent).location) val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) // valid fs.mkdirs(new Path(new Path(root, "a=1"), "b=5")) @@ -1133,7 +1133,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } assert(catalog.getTableMetadata(tableIdent).storage.locationUri.isDefined) assert(catalog.getTableMetadata(tableIdent).storage.properties.isEmpty) - assert(catalog.getPartition(tableIdent, partSpec).storage.locationUri.isEmpty) + assert(catalog.getPartition(tableIdent, partSpec).storage.locationUri.isDefined) assert(catalog.getPartition(tableIdent, partSpec).storage.properties.isEmpty) // Verify that the location is set to the expected string def verifyLocation(expected: String, spec: Option[TablePartitionSpec] = None): Unit = { @@ -1296,9 +1296,9 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("ALTER TABLE dbx.tab1 ADD IF NOT EXISTS " + "PARTITION (a='2', b='6') LOCATION 'paris' PARTITION (a='3', b='7')") assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3)) - assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isEmpty) + assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isDefined) assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option("paris")) - assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isEmpty) + 
assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isDefined) // add partitions without explicitly specifying database catalog.setCurrentDatabase("dbx") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 120a3a2ef33aa..22e35a1bc0b1d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -29,6 +29,7 @@ import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.{PartitionPath => Partition} @@ -48,11 +49,11 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha import PartitioningUtils._ import testImplicits._ - val defaultPartitionName = "__HIVE_DEFAULT_PARTITION__" + val defaultPartitionName = ExternalCatalogUtils.DEFAULT_PARTITION_NAME test("column type inference") { def check(raw: String, literal: Literal): Unit = { - assert(inferPartitionColumnValue(raw, defaultPartitionName, true) === literal) + assert(inferPartitionColumnValue(raw, true) === literal) } check("10", Literal.create(10, IntegerType)) @@ -76,7 +77,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha "hdfs://host:9000/path/a=10.5/b=hello") var exception = intercept[AssertionError] { - parsePartitions(paths.map(new Path(_)), defaultPartitionName, true, Set.empty[Path]) + parsePartitions(paths.map(new Path(_)), true, Set.empty[Path]) } assert(exception.getMessage().contains("Conflicting directory structures detected")) @@ -88,7 +89,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha parsePartitions( paths.map(new Path(_)), - defaultPartitionName, true, Set(new Path("hdfs://host:9000/path/"))) @@ -101,7 +101,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha parsePartitions( paths.map(new Path(_)), - defaultPartitionName, true, Set(new Path("hdfs://host:9000/path/something=true/table"))) @@ -114,7 +113,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha parsePartitions( paths.map(new Path(_)), - defaultPartitionName, true, Set(new Path("hdfs://host:9000/path/table=true"))) @@ -127,7 +125,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha exception = intercept[AssertionError] { parsePartitions( paths.map(new Path(_)), - defaultPartitionName, true, Set(new Path("hdfs://host:9000/path/"))) } @@ -147,7 +144,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha exception = intercept[AssertionError] { parsePartitions( paths.map(new Path(_)), - defaultPartitionName, true, Set(new Path("hdfs://host:9000/tmp/tables/"))) } @@ -156,13 +152,13 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha test("parse partition") { def check(path: String, expected: Option[PartitionValues]): Unit = { - val actual = parsePartition(new Path(path), defaultPartitionName, true, Set.empty[Path])._1 + val actual = 
parsePartition(new Path(path), true, Set.empty[Path])._1 assert(expected === actual) } def checkThrows[T <: Throwable: Manifest](path: String, expected: String): Unit = { val message = intercept[T] { - parsePartition(new Path(path), defaultPartitionName, true, Set.empty[Path]) + parsePartition(new Path(path), true, Set.empty[Path]) }.getMessage assert(message.contains(expected)) @@ -204,7 +200,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha // when the basePaths is the same as the path to a leaf directory val partitionSpec1: Option[PartitionValues] = parsePartition( path = new Path("file://path/a=10"), - defaultPartitionName = defaultPartitionName, typeInference = true, basePaths = Set(new Path("file://path/a=10")))._1 @@ -213,7 +208,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha // when the basePaths is the path to a base directory of leaf directories val partitionSpec2: Option[PartitionValues] = parsePartition( path = new Path("file://path/a=10"), - defaultPartitionName = defaultPartitionName, typeInference = true, basePaths = Set(new Path("file://path")))._1 @@ -231,7 +225,6 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha val actualSpec = parsePartitions( paths.map(new Path(_)), - defaultPartitionName, true, rootPaths) assert(actualSpec === spec) @@ -314,7 +307,7 @@ class ParquetPartitionDiscoverySuite extends QueryTest with ParquetTest with Sha test("parse partitions with type inference disabled") { def check(paths: Seq[String], spec: PartitionSpec): Unit = { val actualSpec = - parsePartitions(paths.map(new Path(_)), defaultPartitionName, false, Set.empty[Path]) + parsePartitions(paths.map(new Path(_)), false, Set.empty[Path]) assert(actualSpec === spec) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index b537061d0d221..42ce1a88a2b67 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive +import java.io.IOException import java.util import scala.util.control.NonFatal @@ -26,7 +27,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.metadata.HiveException import org.apache.thrift.TException -import org.apache.spark.SparkConf +import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier @@ -255,7 +256,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat // compatible format, which means the data source is file-based and must have a `path`. 
require(tableDefinition.storage.locationUri.isDefined, "External file-based data source table must have a `path` entry in storage properties.") - Some(new Path(tableDefinition.storage.locationUri.get).toUri.toString) + Some(new Path(tableDefinition.location).toUri.toString) } else { None } @@ -789,7 +790,21 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat parts: Seq[CatalogTablePartition], ignoreIfExists: Boolean): Unit = withClient { requireTableExists(db, table) - val lowerCasedParts = parts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) + + val tableMeta = getTable(db, table) + val partitionColumnNames = tableMeta.partitionColumnNames + val tablePath = new Path(tableMeta.location) + val partsWithLocation = parts.map { p => + // Ideally we can leave the partition location empty and let Hive metastore to set it. + // However, Hive metastore is not case preserving and will generate wrong partition location + // with lower cased partition column names. Here we set the default partition location + // manually to avoid this problem. + val partitionPath = p.storage.locationUri.map(new Path(_)).getOrElse { + ExternalCatalogUtils.generatePartitionPath(p.spec, partitionColumnNames, tablePath) + } + p.copy(storage = p.storage.copy(locationUri = Some(partitionPath.toString))) + } + val lowerCasedParts = partsWithLocation.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) client.createPartitions(db, table, lowerCasedParts, ignoreIfExists) } @@ -810,6 +825,31 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat newSpecs: Seq[TablePartitionSpec]): Unit = withClient { client.renamePartitions( db, table, specs.map(lowerCasePartitionSpec), newSpecs.map(lowerCasePartitionSpec)) + + val tableMeta = getTable(db, table) + val partitionColumnNames = tableMeta.partitionColumnNames + // Hive metastore is not case preserving and keeps partition columns with lower cased names. + // When Hive rename partition for managed tables, it will create the partition location with + // a default path generate by the new spec with lower cased partition column names. This is + // unexpected and we need to rename them manually and alter the partition location. + val hasUpperCasePartitionColumn = partitionColumnNames.exists(col => col.toLowerCase != col) + if (tableMeta.tableType == MANAGED && hasUpperCasePartitionColumn) { + val tablePath = new Path(tableMeta.location) + val newParts = newSpecs.map { spec => + val partition = client.getPartition(db, table, lowerCasePartitionSpec(spec)) + val wrongPath = new Path(partition.location) + val rightPath = ExternalCatalogUtils.generatePartitionPath( + spec, partitionColumnNames, tablePath) + try { + tablePath.getFileSystem(hadoopConf).rename(wrongPath, rightPath) + } catch { + case e: IOException => throw new SparkException( + s"Unable to rename partition path from $wrongPath to $rightPath", e) + } + partition.copy(storage = partition.storage.copy(locationUri = Some(rightPath.toString))) + } + alterPartitions(db, table, newParts) + } } override def alterPartitions( @@ -817,6 +857,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table: String, newParts: Seq[CatalogTablePartition]): Unit = withClient { val lowerCasedParts = newParts.map(p => p.copy(spec = lowerCasePartitionSpec(p.spec))) + // Note: Before altering table partitions in Hive, you *must* set the current database + // to the one that contains the table of interest. 
Otherwise you will end up with the + // most helpful error message ever: "Unable to alter partition. alter is not possible." + // See HIVE-2742 for more detail. + client.setCurrentDatabase(db) client.alterPartitions(db, table, lowerCasedParts) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index d3873cf6c8231..fbd705172cae6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -445,7 +445,7 @@ object SetWarehouseLocationTest extends Logging { catalog.getTableMetadata(TableIdentifier("testLocation", Some("default"))) val expectedLocation = "file:" + expectedWarehouseLocation.toString + "/testlocation" - val actualLocation = tableMetadata.storage.locationUri.get + val actualLocation = tableMetadata.location if (actualLocation != expectedLocation) { throw new Exception( s"Expected table location is $expectedLocation. But, it is actually $actualLocation") @@ -461,7 +461,7 @@ object SetWarehouseLocationTest extends Logging { catalog.getTableMetadata(TableIdentifier("testLocation", Some("testLocationDB"))) val expectedLocation = "file:" + expectedWarehouseLocation.toString + "/testlocationdb.db/testlocation" - val actualLocation = tableMetadata.storage.locationUri.get + val actualLocation = tableMetadata.location if (actualLocation != expectedLocation) { throw new Exception( s"Expected table location is $expectedLocation. But, it is actually $actualLocation") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala index cfc1d81d544eb..9f4401ae22560 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala @@ -29,7 +29,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingle val expectedPath = spark.sharedState.externalCatalog.getDatabase(dbName).locationUri + "/" + tableName - assert(metastoreTable.storage.locationUri.get === expectedPath) + assert(metastoreTable.location === expectedPath) } private def getTableNames(dbName: Option[String] = None): Array[String] = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 0076a778683ca..6efae13ddf69d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -425,7 +425,7 @@ class HiveDDLSuite sql("CREATE TABLE tab1 (height INT, length INT) PARTITIONED BY (a INT, b INT)") val part1 = Map("a" -> "1", "b" -> "5") val part2 = Map("a" -> "2", "b" -> "6") - val root = new Path(catalog.getTableMetadata(tableIdent).storage.locationUri.get) + val root = new Path(catalog.getTableMetadata(tableIdent).location) val fs = root.getFileSystem(spark.sparkContext.hadoopConfiguration) // valid fs.mkdirs(new Path(new Path(root, "a=1"), "b=5")) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index c21db3595fa19..e607af67f93e5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala 
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -542,7 +542,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } userSpecifiedLocation match { case Some(location) => - assert(r.catalogTable.storage.locationUri.get === location) + assert(r.catalogTable.location === location) case None => // OK. } // Also make sure that the format and serde are as desired. From 064d4315f246450043a52882fcf59e95d79701e8 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Thu, 10 Nov 2016 17:00:43 -0800 Subject: [PATCH 086/534] [SPARK-18185] Fix all forms of INSERT / OVERWRITE TABLE for Datasource tables ## What changes were proposed in this pull request? As of current 2.1, INSERT OVERWRITE with dynamic partitions against a Datasource table will overwrite the entire table instead of only the partitions matching the static keys, as in Hive. It also doesn't respect custom partition locations. This PR adds support for all these operations to Datasource tables managed by the Hive metastore. It is implemented as follows - During planning time, the full set of partitions affected by an INSERT or OVERWRITE command is read from the Hive metastore. - The planner identifies any partitions with custom locations and includes this in the write task metadata. - FileFormatWriter tasks refer to this custom locations map when determining where to write for dynamic partition output. - When the write job finishes, the set of written partitions is compared against the initial set of matched partitions, and the Hive metastore is updated to reflect the newly added / removed partitions. It was necessary to introduce a method for staging files with absolute output paths to `FileCommitProtocol`. These files are not handled by the Hadoop output committer but are moved to their final locations when the job commits. The overwrite behavior of legacy Datasource tables is also changed: no longer will the entire table be overwritten if a partial partition spec is present. cc cloud-fan yhuai ## How was this patch tested? Unit tests, existing tests. Author: Eric Liang Author: Wenchen Fan Closes #15814 from ericl/sc-5027. 
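
For illustration only (not part of this patch): a minimal sketch of the overwrite scope described above, assuming a hypothetical Datasource table `sales` partitioned by `region` and `day`, a hypothetical source table `staging`, and an existing `SparkSession` named `spark`. The static prefix of the PARTITION spec bounds which existing partitions may be dropped; dynamic partition columns are resolved per output row.

```scala
// Sketch only -- table, column, and source names here are hypothetical, not from this patch.
// Assumes `spark` is a SparkSession with Hive support and a populated `staging` table.
spark.sql(
  "CREATE TABLE sales (value INT, region STRING, day STRING) USING parquet " +
    "PARTITIONED BY (region, day)")

// Static prefix (region='US') with a dynamic `day` column: only existing partitions under
// region=US that the metastore matches are eligible to be dropped; other regions are untouched.
spark.sql(
  """INSERT OVERWRITE TABLE sales PARTITION (region='US', day)
    |SELECT value, day FROM staging""".stripMargin)

// Empty partition spec: every existing partition of the table is eligible for overwrite.
spark.sql("INSERT OVERWRITE TABLE sales SELECT value, region, day FROM staging")
```

Partitions that were assigned custom locations (e.g. via ALTER TABLE ... PARTITION ... SET LOCATION) keep writing to those locations: the planner passes the custom-locations map to the write tasks, which stage such files through the new absolute-path hook on FileCommitProtocol.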
(cherry picked from commit a3356343cbf58b930326f45721fb4ecade6f8029) Signed-off-by: Reynold Xin --- .../internal/io/FileCommitProtocol.scala | 15 ++ .../io/HadoopMapReduceCommitProtocol.scala | 63 ++++++- .../sql/catalyst/parser/AstBuilder.scala | 12 +- .../plans/logical/basicLogicalOperators.scala | 10 +- .../sql/catalyst/parser/PlanParserSuite.scala | 4 +- .../execution/datasources/DataSource.scala | 20 ++- .../datasources/DataSourceStrategy.scala | 94 +++++++--- .../datasources/FileFormatWriter.scala | 26 ++- .../InsertIntoHadoopFsRelationCommand.scala | 61 ++++++- .../datasources/PartitioningUtils.scala | 10 ++ .../execution/streaming/FileStreamSink.scala | 2 +- .../ManifestFileCommitProtocol.scala | 6 + .../PartitionProviderCompatibilitySuite.scala | 161 +++++++++++++++++- 13 files changed, 411 insertions(+), 73 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index fb8020585cf89..afd2250c93a8a 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -82,9 +82,24 @@ abstract class FileCommitProtocol { * * The "dir" parameter specifies 2, and "ext" parameter specifies both 4 and 5, and the rest * are left to the commit protocol implementation to decide. + * + * Important: it is the caller's responsibility to add uniquely identifying content to "ext" + * if a task is going to write out multiple files to the same dir. The file commit protocol only + * guarantees that files written by different tasks will not conflict. */ def newTaskTempFile(taskContext: TaskAttemptContext, dir: Option[String], ext: String): String + /** + * Similar to newTaskTempFile(), but allows files to committed to an absolute output location. + * Depending on the implementation, there may be weaker guarantees around adding files this way. + * + * Important: it is the caller's responsibility to add uniquely identifying content to "ext" + * if a task is going to write out multiple files to the same dir. The file commit protocol only + * guarantees that files written by different tasks will not conflict. + */ + def newTaskTempFileAbsPath( + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String + /** * Commits a task after the writes succeed. Must be called on the executors when running tasks. */ diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 66ccb6d437708..c99b75e52325e 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -17,7 +17,9 @@ package org.apache.spark.internal.io -import java.util.Date +import java.util.{Date, UUID} + +import scala.collection.mutable import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ @@ -42,17 +44,26 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) /** OutputCommitter from Hadoop is not serializable so marking it transient. */ @transient private var committer: OutputCommitter = _ + /** + * Tracks files staged by this task for absolute output paths. These outputs are not managed by + * the Hadoop OutputCommitter, so we must move these to their final locations on job commit. 
+ * + * The mapping is from the temp output path to the final desired output path of the file. + */ + @transient private var addedAbsPathFiles: mutable.Map[String, String] = null + + /** + * The staging directory for all files committed with absolute output paths. + */ + private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) + protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { context.getOutputFormatClass.newInstance().getOutputCommitter(context) } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { - // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet - // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, - // the file name is fine and won't overflow. - val split = taskContext.getTaskAttemptID.getTaskID.getId - val filename = f"part-$split%05d-$jobId$ext" + val filename = getFilename(taskContext, ext) val stagingDir: String = committer match { // For FileOutputCommitter it has its own staging path called "work path". @@ -67,6 +78,28 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) } } + override def newTaskTempFileAbsPath( + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { + val filename = getFilename(taskContext, ext) + val absOutputPath = new Path(absoluteDir, filename).toString + + // Include a UUID here to prevent file collisions for one task writing to different dirs. + // In principle we could include hash(absoluteDir) instead but this is simpler. + val tmpOutputPath = new Path( + absPathStagingDir, UUID.randomUUID().toString() + "-" + filename).toString + + addedAbsPathFiles(tmpOutputPath) = absOutputPath + tmpOutputPath + } + + private def getFilename(taskContext: TaskAttemptContext, ext: String): String = { + // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet + // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, + // the file name is fine and won't overflow. 
+ val split = taskContext.getTaskAttemptID.getTaskID.getId + f"part-$split%05d-$jobId$ext" + } + override def setupJob(jobContext: JobContext): Unit = { // Setup IDs val jobId = SparkHadoopWriter.createJobID(new Date, 0) @@ -87,25 +120,41 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) + val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) + .foldLeft(Map[String, String]())(_ ++ _) + logDebug(s"Committing files staged for absolute locations $filesToMove") + val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + for ((src, dst) <- filesToMove) { + fs.rename(new Path(src), new Path(dst)) + } + fs.delete(absPathStagingDir, true) } override def abortJob(jobContext: JobContext): Unit = { committer.abortJob(jobContext, JobStatus.State.FAILED) + val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + fs.delete(absPathStagingDir, true) } override def setupTask(taskContext: TaskAttemptContext): Unit = { committer = setupCommitter(taskContext) committer.setupTask(taskContext) + addedAbsPathFiles = mutable.Map[String, String]() } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { val attemptId = taskContext.getTaskAttemptID SparkHadoopMapRedUtil.commitTask( committer, taskContext, attemptId.getJobID.getId, attemptId.getTaskID.getId) - EmptyTaskCommitMessage + new TaskCommitMessage(addedAbsPathFiles.toMap) } override def abortTask(taskContext: TaskAttemptContext): Unit = { committer.abortTask(taskContext) + // best effort cleanup of other staged files + for ((src, _) <- addedAbsPathFiles) { + val tmp = new Path(src) + tmp.getFileSystem(taskContext.getConfiguration).delete(tmp, false) + } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 4b151c81d8f8b..2006844923cf7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -172,24 +172,20 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { val tableIdent = visitTableIdentifier(ctx.tableIdentifier) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) - val dynamicPartitionKeys = partitionKeys.filter(_._2.isEmpty) + val dynamicPartitionKeys: Map[String, Option[String]] = partitionKeys.filter(_._2.isEmpty) if (ctx.EXISTS != null && dynamicPartitionKeys.nonEmpty) { throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. 
Specified " + "partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) } val overwrite = ctx.OVERWRITE != null - val overwritePartition = - if (overwrite && partitionKeys.nonEmpty && dynamicPartitionKeys.isEmpty) { - Some(partitionKeys.map(t => (t._1, t._2.get))) - } else { - None - } + val staticPartitionKeys: Map[String, String] = + partitionKeys.filter(_._2.nonEmpty).map(t => (t._1, t._2.get)) InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, query, - OverwriteOptions(overwrite, overwritePartition), + OverwriteOptions(overwrite, if (overwrite) staticPartitionKeys else Map.empty), ctx.EXISTS != null) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 65ceab2ce27b1..574caf039d3d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -350,13 +350,15 @@ case class BroadcastHint(child: LogicalPlan) extends UnaryNode { * Options for writing new data into a table. * * @param enabled whether to overwrite existing data in the table. - * @param specificPartition only data in the specified partition will be overwritten. + * @param staticPartitionKeys if non-empty, specifies that we only want to overwrite partitions + * that match this partial partition spec. If empty, all partitions + * will be overwritten. */ case class OverwriteOptions( enabled: Boolean, - specificPartition: Option[CatalogTypes.TablePartitionSpec] = None) { - if (specificPartition.isDefined) { - assert(enabled, "Overwrite must be enabled when specifying a partition to overwrite.") + staticPartitionKeys: CatalogTypes.TablePartitionSpec = Map.empty) { + if (staticPartitionKeys.nonEmpty) { + assert(enabled, "Overwrite must be enabled when specifying specific partitions.") } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 7400f3430e99c..e5f1f7b3bd4cf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -185,9 +185,9 @@ class PlanParserSuite extends PlanTest { OverwriteOptions( overwrite, if (overwrite && partition.nonEmpty) { - Some(partition.map(kv => (kv._1, kv._2.get))) + partition.map(kv => (kv._1, kv._2.get)) } else { - None + Map.empty }), ifNotExists) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 5d663949df6b5..65422f1495f03 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -417,15 +417,17 @@ case class DataSource( // will be adjusted within InsertIntoHadoopFsRelation. val plan = InsertIntoHadoopFsRelationCommand( - outputPath, - columns, - bucketSpec, - format, - _ => Unit, // No existing table needs to be refreshed. 
- options, - data.logicalPlan, - mode, - catalogTable) + outputPath = outputPath, + staticPartitionKeys = Map.empty, + customPartitionLocations = Map.empty, + partitionColumns = columns, + bucketSpec = bucketSpec, + fileFormat = format, + refreshFunction = _ => Unit, // No existing table needs to be refreshed. + options = options, + query = data.logicalPlan, + mode = mode, + catalogTable = catalogTable) sparkSession.sessionState.executePlan(plan).toRdd // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring it. copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 739aeac877b99..4f19a2d00b0e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -24,10 +24,10 @@ import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.analysis._ -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SimpleCatalogRelation} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition, SimpleCatalogRelation} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ @@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} -import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, DDLUtils, ExecutedCommandExec} +import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -182,41 +182,53 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { "Cannot overwrite a path that is also being read from.") } - val overwritingSinglePartition = - overwrite.specificPartition.isDefined && + val partitionSchema = query.resolve( + t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver) + val partitionsTrackedByCatalog = t.sparkSession.sessionState.conf.manageFilesourcePartitions && + l.catalogTable.isDefined && l.catalogTable.get.partitionColumnNames.nonEmpty && l.catalogTable.get.tracksPartitionsInCatalog - val effectiveOutputPath = if (overwritingSinglePartition) { - val partition = t.sparkSession.sessionState.catalog.getPartition( - l.catalogTable.get.identifier, overwrite.specificPartition.get) - new Path(partition.location) - } else { - outputPath - } - - val effectivePartitionSchema = if (overwritingSinglePartition) { - Nil - } else { - query.resolve(t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver) + var initialMatchingPartitions: Seq[TablePartitionSpec] = Nil 
+ var customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty + + // When partitions are tracked by the catalog, compute all custom partition locations that + // may be relevant to the insertion job. + if (partitionsTrackedByCatalog) { + val matchingPartitions = t.sparkSession.sessionState.catalog.listPartitions( + l.catalogTable.get.identifier, Some(overwrite.staticPartitionKeys)) + initialMatchingPartitions = matchingPartitions.map(_.spec) + customPartitionLocations = getCustomPartitionLocations( + t.sparkSession, l.catalogTable.get, outputPath, matchingPartitions) } + // Callback for updating metastore partition metadata after the insertion job completes. + // TODO(ekl) consider moving this into InsertIntoHadoopFsRelationCommand def refreshPartitionsCallback(updatedPartitions: Seq[TablePartitionSpec]): Unit = { - if (l.catalogTable.isDefined && updatedPartitions.nonEmpty && - l.catalogTable.get.partitionColumnNames.nonEmpty && - l.catalogTable.get.tracksPartitionsInCatalog) { - val metastoreUpdater = AlterTableAddPartitionCommand( - l.catalogTable.get.identifier, - updatedPartitions.map(p => (p, None)), - ifNotExists = true) - metastoreUpdater.run(t.sparkSession) + if (partitionsTrackedByCatalog) { + val newPartitions = updatedPartitions.toSet -- initialMatchingPartitions + if (newPartitions.nonEmpty) { + AlterTableAddPartitionCommand( + l.catalogTable.get.identifier, newPartitions.toSeq.map(p => (p, None)), + ifNotExists = true).run(t.sparkSession) + } + if (overwrite.enabled) { + val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions + if (deletedPartitions.nonEmpty) { + AlterTableDropPartitionCommand( + l.catalogTable.get.identifier, deletedPartitions.toSeq, + ifExists = true, purge = true).run(t.sparkSession) + } + } } t.location.refresh() } val insertCmd = InsertIntoHadoopFsRelationCommand( - effectiveOutputPath, - effectivePartitionSchema, + outputPath, + if (overwrite.enabled) overwrite.staticPartitionKeys else Map.empty, + customPartitionLocations, + partitionSchema, t.bucketSpec, t.fileFormat, refreshPartitionsCallback, @@ -227,6 +239,34 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { insertCmd } + + /** + * Given a set of input partitions, returns those that have locations that differ from the + * Hive default (e.g. /k1=v1/k2=v2). These partitions were manually assigned locations by + * the user. 
+ * + * @return a mapping from partition specs to their custom locations + */ + private def getCustomPartitionLocations( + spark: SparkSession, + table: CatalogTable, + basePath: Path, + partitions: Seq[CatalogTablePartition]): Map[TablePartitionSpec, String] = { + val hadoopConf = spark.sessionState.newHadoopConf + val fs = basePath.getFileSystem(hadoopConf) + val qualifiedBasePath = basePath.makeQualified(fs.getUri, fs.getWorkingDirectory) + partitions.flatMap { p => + val defaultLocation = qualifiedBasePath.suffix( + "/" + PartitioningUtils.getPathFragment(p.spec, table.partitionSchema)).toString + val catalogLocation = new Path(p.location).makeQualified( + fs.getUri, fs.getWorkingDirectory).toString + if (catalogLocation != defaultLocation) { + Some(p.spec -> catalogLocation) + } else { + None + } + }.toMap + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index 0f8ed9e23fe3b..edcce103d0963 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -47,6 +47,10 @@ import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter /** A helper object for writing FileFormat data out to a location. */ object FileFormatWriter extends Logging { + /** Describes how output files should be placed in the filesystem. */ + case class OutputSpec( + outputPath: String, customPartitionLocations: Map[TablePartitionSpec, String]) + /** A shared job description for all the write tasks. */ private class WriteJobDescription( val uuid: String, // prevent collision between different (appending) write jobs @@ -56,7 +60,8 @@ object FileFormatWriter extends Logging { val partitionColumns: Seq[Attribute], val nonPartitionColumns: Seq[Attribute], val bucketSpec: Option[BucketSpec], - val path: String) + val path: String, + val customPartitionLocations: Map[TablePartitionSpec, String]) extends Serializable { assert(AttributeSet(allColumns) == AttributeSet(partitionColumns ++ nonPartitionColumns), @@ -83,7 +88,7 @@ object FileFormatWriter extends Logging { plan: LogicalPlan, fileFormat: FileFormat, committer: FileCommitProtocol, - outputPath: String, + outputSpec: OutputSpec, hadoopConf: Configuration, partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], @@ -93,7 +98,7 @@ object FileFormatWriter extends Logging { val job = Job.getInstance(hadoopConf) job.setOutputKeyClass(classOf[Void]) job.setOutputValueClass(classOf[InternalRow]) - FileOutputFormat.setOutputPath(job, new Path(outputPath)) + FileOutputFormat.setOutputPath(job, new Path(outputSpec.outputPath)) val partitionSet = AttributeSet(partitionColumns) val dataColumns = plan.output.filterNot(partitionSet.contains) @@ -111,7 +116,8 @@ object FileFormatWriter extends Logging { partitionColumns = partitionColumns, nonPartitionColumns = dataColumns, bucketSpec = bucketSpec, - path = outputPath) + path = outputSpec.outputPath, + customPartitionLocations = outputSpec.customPartitionLocations) SQLExecution.withNewExecutionId(sparkSession, queryExecution) { // This call shouldn't be put into the `try` block below because it only initializes and @@ -308,7 +314,17 @@ object FileFormatWriter extends Logging { } val ext = bucketId + description.outputWriterFactory.getFileExtension(taskAttemptContext) - val path = committer.newTaskTempFile(taskAttemptContext, 
partDir, ext) + val customPath = partDir match { + case Some(dir) => + description.customPartitionLocations.get(PartitioningUtils.parsePathFragment(dir)) + case _ => + None + } + val path = if (customPath.isDefined) { + committer.newTaskTempFileAbsPath(taskAttemptContext, customPath.get, ext) + } else { + committer.newTaskTempFile(taskAttemptContext, partDir, ext) + } val newWriter = description.outputWriterFactory.newInstance( path = path, dataSchema = description.nonPartitionColumns.toStructType, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index a0a8cb5024c33..28975e1546e79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources import java.io.IOException -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql._ @@ -32,19 +32,32 @@ import org.apache.spark.sql.execution.command.RunnableCommand /** * A command for writing data to a [[HadoopFsRelation]]. Supports both overwriting and appending. * Writing to dynamic partitions is also supported. + * + * @param staticPartitionKeys partial partitioning spec for write. This defines the scope of + * partition overwrites: when the spec is empty, all partitions are + * overwritten. When it covers a prefix of the partition keys, only + * partitions matching the prefix are overwritten. + * @param customPartitionLocations mapping of partition specs to their custom locations. The + * caller should guarantee that exactly those table partitions + * falling under the specified static partition keys are contained + * in this map, and that no other partitions are. 
*/ case class InsertIntoHadoopFsRelationCommand( outputPath: Path, + staticPartitionKeys: TablePartitionSpec, + customPartitionLocations: Map[TablePartitionSpec, String], partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], fileFormat: FileFormat, - refreshFunction: (Seq[TablePartitionSpec]) => Unit, + refreshFunction: Seq[TablePartitionSpec] => Unit, options: Map[String, String], @transient query: LogicalPlan, mode: SaveMode, catalogTable: Option[CatalogTable]) extends RunnableCommand { + import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName + override protected def innerChildren: Seq[LogicalPlan] = query :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { @@ -66,10 +79,7 @@ case class InsertIntoHadoopFsRelationCommand( case (SaveMode.ErrorIfExists, true) => throw new AnalysisException(s"path $qualifiedOutputPath already exists.") case (SaveMode.Overwrite, true) => - if (!fs.delete(qualifiedOutputPath, true /* recursively */)) { - throw new IOException(s"Unable to clear output " + - s"directory $qualifiedOutputPath prior to writing to it") - } + deleteMatchingPartitions(fs, qualifiedOutputPath) true case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => true @@ -93,7 +103,8 @@ case class InsertIntoHadoopFsRelationCommand( plan = query, fileFormat = fileFormat, committer = committer, - outputPath = qualifiedOutputPath.toString, + outputSpec = FileFormatWriter.OutputSpec( + qualifiedOutputPath.toString, customPartitionLocations), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = bucketSpec, @@ -105,4 +116,40 @@ case class InsertIntoHadoopFsRelationCommand( Seq.empty[Row] } + + /** + * Deletes all partition files that match the specified static prefix. Partitions with custom + * locations are also cleared based on the custom locations map given to this class. + */ + private def deleteMatchingPartitions(fs: FileSystem, qualifiedOutputPath: Path): Unit = { + val staticPartitionPrefix = if (staticPartitionKeys.nonEmpty) { + "/" + partitionColumns.flatMap { p => + staticPartitionKeys.get(p.name) match { + case Some(value) => + Some(escapePathName(p.name) + "=" + escapePathName(value)) + case None => + None + } + }.mkString("/") + } else { + "" + } + // first clear the path determined by the static partition keys (e.g. /table/foo=1) + val staticPrefixPath = qualifiedOutputPath.suffix(staticPartitionPrefix) + if (fs.exists(staticPrefixPath) && !fs.delete(staticPrefixPath, true /* recursively */)) { + throw new IOException(s"Unable to clear output " + + s"directory $staticPrefixPath prior to writing to it") + } + // now clear all custom partition locations (e.g. 
/custom/dir/where/foo=2/bar=4) + for ((spec, customLoc) <- customPartitionLocations) { + assert( + (staticPartitionKeys.toSet -- spec).isEmpty, + "Custom partition location did not match static partitioning keys") + val path = new Path(customLoc) + if (fs.exists(path) && !fs.delete(path, true)) { + throw new IOException(s"Unable to clear partition " + + s"directory $path prior to writing to it") + } + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index a28b04ca3fb5a..bf9f318780ec2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -62,6 +62,7 @@ object PartitioningUtils { } import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.DEFAULT_PARTITION_NAME + import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.unescapePathName /** @@ -252,6 +253,15 @@ object PartitioningUtils { }.toMap } + /** + * This is the inverse of parsePathFragment(). + */ + def getPathFragment(spec: TablePartitionSpec, partitionSchema: StructType): String = { + partitionSchema.map { field => + escapePathName(field.name) + "=" + escapePathName(spec(field.name)) + }.mkString("/") + } + /** * Normalize the column names in partition specification, w.r.t. the real partition column names * and case sensitivity. e.g., if the partition spec has a column named `monTh`, and there is a diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala index e849cafef4184..f1c5f9ab5067d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSink.scala @@ -80,7 +80,7 @@ class FileStreamSink( plan = data.logicalPlan, fileFormat = fileFormat, committer = committer, - outputPath = path, + outputSpec = FileFormatWriter.OutputSpec(path, Map.empty), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = None, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 1fe13fa1623fc..92191c8b64b72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -96,6 +96,12 @@ class ManifestFileCommitProtocol(jobId: String, path: String) file } + override def newTaskTempFileAbsPath( + taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { + throw new UnsupportedOperationException( + s"$this does not support adding files with an absolute path") + } + override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala index 
ac435bf6195b0..a1aa07456fd36 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.{AnalysisException, QueryTest} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.Utils class PartitionProviderCompatibilitySuite extends QueryTest with TestHiveSingleton with SQLTestUtils { @@ -135,7 +136,7 @@ class PartitionProviderCompatibilitySuite } } - test("insert overwrite partition of legacy datasource table overwrites entire table") { + test("insert overwrite partition of legacy datasource table") { withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { withTable("test") { withTempDir { dir => @@ -144,9 +145,9 @@ class PartitionProviderCompatibilitySuite """insert overwrite table test |partition (partCol=1) |select * from range(100)""".stripMargin) - assert(spark.sql("select * from test").count() == 100) + assert(spark.sql("select * from test").count() == 104) - // Dynamic partitions case + // Overwriting entire table spark.sql("insert overwrite table test select id, id from range(10)".stripMargin) assert(spark.sql("select * from test").count() == 10) } @@ -186,4 +187,158 @@ class PartitionProviderCompatibilitySuite } } } + + /** + * Runs a test against a multi-level partitioned table, then validates that the custom locations + * were respected by the output writer. + * + * The initial partitioning structure is: + * /P1=0/P2=0 -- custom location a + * /P1=0/P2=1 -- custom location b + * /P1=1/P2=0 -- custom location c + * /P1=1/P2=1 -- default location + */ + private def testCustomLocations(testFn: => Unit): Unit = { + val base = Utils.createTempDir(namePrefix = "base") + val a = Utils.createTempDir(namePrefix = "a") + val b = Utils.createTempDir(namePrefix = "b") + val c = Utils.createTempDir(namePrefix = "c") + try { + spark.sql(s""" + |create table test (id long, P1 int, P2 int) + |using parquet + |options (path "${base.getAbsolutePath}") + |partitioned by (P1, P2)""".stripMargin) + spark.sql(s"alter table test add partition (P1=0, P2=0) location '${a.getAbsolutePath}'") + spark.sql(s"alter table test add partition (P1=0, P2=1) location '${b.getAbsolutePath}'") + spark.sql(s"alter table test add partition (P1=1, P2=0) location '${c.getAbsolutePath}'") + spark.sql(s"alter table test add partition (P1=1, P2=1)") + + testFn + + // Now validate the partition custom locations were respected + val initialCount = spark.sql("select * from test").count() + val numA = spark.sql("select * from test where P1=0 and P2=0").count() + val numB = spark.sql("select * from test where P1=0 and P2=1").count() + val numC = spark.sql("select * from test where P1=1 and P2=0").count() + Utils.deleteRecursively(a) + spark.sql("refresh table test") + assert(spark.sql("select * from test where P1=0 and P2=0").count() == 0) + assert(spark.sql("select * from test").count() == initialCount - numA) + Utils.deleteRecursively(b) + spark.sql("refresh table test") + assert(spark.sql("select * from test where P1=0 and P2=1").count() == 0) + assert(spark.sql("select * from test").count() == initialCount - numA - numB) + Utils.deleteRecursively(c) + spark.sql("refresh table test") + assert(spark.sql("select * from test where P1=1 and P2=0").count() == 0) + assert(spark.sql("select * from test").count() 
== initialCount - numA - numB - numC) + } finally { + Utils.deleteRecursively(base) + Utils.deleteRecursively(a) + Utils.deleteRecursively(b) + Utils.deleteRecursively(c) + spark.sql("drop table test") + } + } + + test("sanity check table setup") { + testCustomLocations { + assert(spark.sql("select * from test").count() == 0) + assert(spark.sql("show partitions test").count() == 4) + } + } + + test("insert into partial dynamic partitions") { + testCustomLocations { + spark.sql("insert into test partition (P1=0, P2) select id, id from range(10)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 12) + spark.sql("insert into test partition (P1=0, P2) select id, id from range(10)") + assert(spark.sql("select * from test").count() == 20) + assert(spark.sql("show partitions test").count() == 12) + spark.sql("insert into test partition (P1=1, P2) select id, id from range(10)") + assert(spark.sql("select * from test").count() == 30) + assert(spark.sql("show partitions test").count() == 20) + spark.sql("insert into test partition (P1=2, P2) select id, id from range(10)") + assert(spark.sql("select * from test").count() == 40) + assert(spark.sql("show partitions test").count() == 30) + } + } + + test("insert into fully dynamic partitions") { + testCustomLocations { + spark.sql("insert into test partition (P1, P2) select id, id, id from range(10)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 12) + spark.sql("insert into test partition (P1, P2) select id, id, id from range(10)") + assert(spark.sql("select * from test").count() == 20) + assert(spark.sql("show partitions test").count() == 12) + } + } + + test("insert into static partition") { + testCustomLocations { + spark.sql("insert into test partition (P1=0, P2=0) select id from range(10)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 4) + spark.sql("insert into test partition (P1=0, P2=0) select id from range(10)") + assert(spark.sql("select * from test").count() == 20) + assert(spark.sql("show partitions test").count() == 4) + spark.sql("insert into test partition (P1=1, P2=1) select id from range(10)") + assert(spark.sql("select * from test").count() == 30) + assert(spark.sql("show partitions test").count() == 4) + } + } + + test("overwrite partial dynamic partitions") { + testCustomLocations { + spark.sql("insert overwrite table test partition (P1=0, P2) select id, id from range(10)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 12) + spark.sql("insert overwrite table test partition (P1=0, P2) select id, id from range(5)") + assert(spark.sql("select * from test").count() == 5) + assert(spark.sql("show partitions test").count() == 7) + spark.sql("insert overwrite table test partition (P1=0, P2) select id, id from range(1)") + assert(spark.sql("select * from test").count() == 1) + assert(spark.sql("show partitions test").count() == 3) + spark.sql("insert overwrite table test partition (P1=1, P2) select id, id from range(10)") + assert(spark.sql("select * from test").count() == 11) + assert(spark.sql("show partitions test").count() == 11) + spark.sql("insert overwrite table test partition (P1=1, P2) select id, id from range(1)") + assert(spark.sql("select * from test").count() == 2) + assert(spark.sql("show partitions test").count() == 2) + spark.sql("insert overwrite table test partition (P1=3, 
P2) select id, id from range(100)") + assert(spark.sql("select * from test").count() == 102) + assert(spark.sql("show partitions test").count() == 102) + } + } + + test("overwrite fully dynamic partitions") { + testCustomLocations { + spark.sql("insert overwrite table test partition (P1, P2) select id, id, id from range(10)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 10) + spark.sql("insert overwrite table test partition (P1, P2) select id, id, id from range(5)") + assert(spark.sql("select * from test").count() == 5) + assert(spark.sql("show partitions test").count() == 5) + } + } + + test("overwrite static partition") { + testCustomLocations { + spark.sql("insert overwrite table test partition (P1=0, P2=0) select id from range(10)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 4) + spark.sql("insert overwrite table test partition (P1=0, P2=0) select id from range(5)") + assert(spark.sql("select * from test").count() == 5) + assert(spark.sql("show partitions test").count() == 4) + spark.sql("insert overwrite table test partition (P1=1, P2=1) select id from range(5)") + assert(spark.sql("select * from test").count() == 10) + assert(spark.sql("show partitions test").count() == 4) + spark.sql("insert overwrite table test partition (P1=1, P2=2) select id from range(5)") + assert(spark.sql("select * from test").count() == 15) + assert(spark.sql("show partitions test").count() == 5) + } + } } From 51dca6143670ec1c1cb090047c3941becaf41fa9 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 10 Nov 2016 17:13:10 -0800 Subject: [PATCH 087/534] [SPARK-18401][SPARKR][ML] SparkR random forest should support output original label. ## What changes were proposed in this pull request? SparkR ```spark.randomForest``` classification prediction should output original label rather than the indexed label. This issue is very similar with [SPARK-18291](https://issues.apache.org/jira/browse/SPARK-18291). ## How was this patch tested? Add unit tests. Author: Yanbo Liang Closes #15842 from yanboliang/spark-18401. 
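For context, the change below maps the classifier's numeric predictions back to the original string labels by appending an `IndexToString` stage to the fitted pipeline. A minimal standalone sketch of that pattern follows; the column names are illustrative placeholders, not the wrapper's actual constants:

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{IndexToString, StringIndexer}

// Hypothetical column names, for illustration only.
val labelIndexer = new StringIndexer()
  .setInputCol("species")        // original string label
  .setOutputCol("indexedLabel")  // numeric index the classifier trains on

val rf = new RandomForestClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("features")
  .setPredictionCol("predictedIndex")

// Map the predicted index back to the original label string, which is what
// the R user ultimately sees in the "prediction" column.
val labelConverter = new IndexToString()
  .setInputCol("predictedIndex")
  .setOutputCol("prediction")
  // .setLabels(...) would normally be populated from the fitted indexer's labels.

val pipeline = new Pipeline().setStages(Array(labelIndexer, rf, labelConverter))
```

In the wrapper itself, the labels are read from the `NominalAttribute` metadata that `RFormula` attaches to the indexed label column, as the diff below shows.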
(cherry picked from commit 5ddf69470b93c0b8a28bb4ac905e7670d9c50a95) Signed-off-by: Yanbo Liang --- R/pkg/inst/tests/testthat/test_mllib.R | 24 ++++++++++++++++ .../r/RandomForestClassificationWrapper.scala | 28 ++++++++++++++++--- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 1e456ef5c6b16..33e85b78de4fe 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -935,6 +935,10 @@ test_that("spark.randomForest Classification", { expect_equal(stats$numTrees, 20) expect_error(capture.output(stats), NA) expect_true(length(capture.output(stats)) > 6) + # Test string prediction values + predictions <- collect(predict(model, data))$prediction + expect_equal(length(grep("setosa", predictions)), 50) + expect_equal(length(grep("versicolor", predictions)), 50) modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp") write.ml(model, modelPath) @@ -947,6 +951,26 @@ test_that("spark.randomForest Classification", { expect_equal(stats$numClasses, stats2$numClasses) unlink(modelPath) + + # Test numeric response variable + labelToIndex <- function(species) { + switch(as.character(species), + setosa = 0.0, + versicolor = 1.0, + virginica = 2.0 + ) + } + iris$NumericSpecies <- lapply(iris$Species, labelToIndex) + data <- suppressWarnings(createDataFrame(iris[-5])) + model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification", + maxDepth = 5, maxBins = 16) + stats <- summary(model) + expect_equal(stats$numFeatures, 2) + expect_equal(stats$numTrees, 20) + # Test numeric prediction values + predictions <- collect(predict(model, data))$prediction + expect_equal(length(grep("1.0", predictions)), 50) + expect_equal(length(grep("2.0", predictions)), 50) }) test_that("spark.gbt", { diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala index 6947ba7e7597a..31f846dc6cfec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala @@ -23,9 +23,9 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} -import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -35,6 +35,8 @@ private[r] class RandomForestClassifierWrapper private ( val formula: String, val features: Array[String]) extends MLWritable { + import RandomForestClassifierWrapper._ + private val rfcModel: RandomForestClassificationModel = pipeline.stages(1).asInstanceOf[RandomForestClassificationModel] @@ -46,7 +48,9 @@ private[r] class RandomForestClassifierWrapper private ( def summary: String = rfcModel.toDebugString def transform(dataset: Dataset[_]): DataFrame = { - pipeline.transform(dataset).drop(rfcModel.getFeaturesCol) + pipeline.transform(dataset) + .drop(PREDICTED_LABEL_INDEX_COL) + .drop(rfcModel.getFeaturesCol) } override def 
write: MLWriter = new @@ -54,6 +58,10 @@ private[r] class RandomForestClassifierWrapper private ( } private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestClassifierWrapper] { + + val PREDICTED_LABEL_INDEX_COL = "pred_label_idx" + val PREDICTED_LABEL_COL = "prediction" + def fit( // scalastyle:ignore data: DataFrame, formula: String, @@ -73,6 +81,7 @@ private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestC val rFormula = new RFormula() .setFormula(formula) + .setForceIndexLabel(true) RWrapperUtils.checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) @@ -82,6 +91,11 @@ private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestC .attributes.get val features = featureAttrs.map(_.name.get) + // get label names from output schema + val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) + .asInstanceOf[NominalAttribute] + val labels = labelAttr.values.get + // assemble and fit the pipeline val rfc = new RandomForestClassifier() .setMaxDepth(maxDepth) @@ -97,10 +111,16 @@ private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestC .setCacheNodeIds(cacheNodeIds) .setProbabilityCol(probabilityCol) .setFeaturesCol(rFormula.getFeaturesCol) + .setPredictionCol(PREDICTED_LABEL_INDEX_COL) if (seed != null && seed.length > 0) rfc.setSeed(seed.toLong) + val idxToStr = new IndexToString() + .setInputCol(PREDICTED_LABEL_INDEX_COL) + .setOutputCol(PREDICTED_LABEL_COL) + .setLabels(labels) + val pipeline = new Pipeline() - .setStages(Array(rFormulaModel, rfc)) + .setStages(Array(rFormulaModel, rfc, idxToStr)) .fit(data) new RandomForestClassifierWrapper(pipeline, formula, features) From 00c9c7d96489778dfe38a36675d3162bf8844880 Mon Sep 17 00:00:00 2001 From: Vinayak Date: Fri, 11 Nov 2016 12:54:16 -0600 Subject: [PATCH 088/534] [SPARK-17843][WEB UI] Indicate event logs pending for processing on history server UI ## What changes were proposed in this pull request? History Server UI's application listing to display information on currently under process event logs so a user knows that pending this processing an application may not list on the UI. When there are no event logs under process, the application list page has a "Last Updated" date-time at the top indicating the date-time of the last _completed_ scan of the event logs. The value is displayed to the user in his/her local time zone. ## How was this patch tested? All unit tests pass. Particularly all the suites under org.apache.spark.deploy.history.\* were run to test changes. - Very first startup - Pending logs - no logs processed yet: screen shot 2016-10-24 at 3 07 04 pm - Very first startup - Pending logs - some logs processed: screen shot 2016-10-24 at 3 18 42 pm - Last updated - No currently pending logs: screen shot 2016-10-17 at 8 34 37 pm - Last updated - With some currently pending logs: screen shot 2016-10-24 at 3 09 31 pm - No applications found and No currently pending logs: screen shot 2016-10-24 at 3 24 26 pm Author: Vinayak Closes #15410 from vijoshi/SAAS-608_master. 
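The provider changes below come down to two pieces of thread-safe bookkeeping that the UI polls: a counter of event logs still being replayed and the timestamp of the last completed scan. A minimal sketch of that idea, with illustrative names rather than the actual `FsHistoryProvider` members:

```scala
import java.util.concurrent.atomic.{AtomicInteger, AtomicLong}

// Illustrative stand-in for the provider-side bookkeeping: a scanning thread
// updates these values while the UI thread reads them.
class ScanStatus {
  private val pendingTasks = new AtomicInteger(0)
  private val lastScanTime = new AtomicLong(-1L)

  def getEventLogsUnderProcess(): Int = pendingTasks.get()
  def getLastUpdatedTime(): Long = lastScanTime.get()

  def runScan(tasks: Seq[() => Unit]): Unit = {
    pendingTasks.addAndGet(tasks.size)
    tasks.foreach { task =>
      // Decrement even if a task fails, so the pending count cannot drift upward.
      try task() finally pendingTasks.decrementAndGet()
    }
    lastScanTime.set(System.currentTimeMillis())
  }
}
```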
(cherry picked from commit a531fe1a82ec515314f2db2e2305283fef24067f) Signed-off-by: Tom Graves --- .../spark/ui/static/historypage-common.js | 24 ++++++++ .../history/ApplicationHistoryProvider.scala | 24 ++++++++ .../deploy/history/FsHistoryProvider.scala | 59 +++++++++++++------ .../spark/deploy/history/HistoryPage.scala | 19 ++++++ .../spark/deploy/history/HistoryServer.scala | 8 +++ 5 files changed, 116 insertions(+), 18 deletions(-) create mode 100644 core/src/main/resources/org/apache/spark/ui/static/historypage-common.js diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js b/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js new file mode 100644 index 0000000000000..55d540d8317a0 --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +$(document).ready(function() { + if ($('#last-updated').length) { + var lastUpdatedMillis = Number($('#last-updated').text()); + var updatedDate = new Date(lastUpdatedMillis); + $('#last-updated').text(updatedDate.toLocaleDateString()+", "+updatedDate.toLocaleTimeString()) + } +}); diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index 06530ff836466..d7d82800b8b55 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -74,6 +74,30 @@ private[history] case class LoadedAppUI( private[history] abstract class ApplicationHistoryProvider { + /** + * Returns the count of application event logs that the provider is currently still processing. + * History Server UI can use this to indicate to a user that the application listing on the UI + * can be expected to list additional known applications once the processing of these + * application event logs completes. + * + * A History Provider that does not have a notion of count of event logs that may be pending + * for processing need not override this method. + * + * @return Count of application event logs that are currently under process + */ + def getEventLogsUnderProcess(): Int = { + return 0; + } + + /** + * Returns the time the history provider last updated the application history information + * + * @return 0 if this is undefined or unsupported, otherwise the last updated time in millis + */ + def getLastUpdatedTime(): Long = { + return 0; + } + /** * Returns a list of applications available for the history server to show. 
* diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index dfc1aad64c818..ca38a47639422 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.history import java.io.{FileNotFoundException, IOException, OutputStream} import java.util.UUID -import java.util.concurrent.{Executors, ExecutorService, TimeUnit} +import java.util.concurrent.{Executors, ExecutorService, Future, TimeUnit} import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.mutable @@ -108,7 +108,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // The modification time of the newest log detected during the last scan. Currently only // used for logging msgs (logs are re-scanned based on file size, rather than modtime) - private var lastScanTime = -1L + private val lastScanTime = new java.util.concurrent.atomic.AtomicLong(-1) // Mapping of application IDs to their metadata, in descending end time order. Apps are inserted // into the map in order, so the LinkedHashMap maintains the correct ordering. @@ -120,6 +120,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // List of application logs to be deleted by event log cleaner. private var attemptsToClean = new mutable.ListBuffer[FsApplicationAttemptInfo] + private val pendingReplayTasksCount = new java.util.concurrent.atomic.AtomicInteger(0) + /** * Return a runnable that performs the given operation on the event logs. * This operation is expected to be executed periodically. @@ -226,6 +228,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) applications.get(appId) } + override def getEventLogsUnderProcess(): Int = pendingReplayTasksCount.get() + + override def getLastUpdatedTime(): Long = lastScanTime.get() + override def getAppUI(appId: String, attemptId: Option[String]): Option[LoadedAppUI] = { try { applications.get(appId).flatMap { appInfo => @@ -329,26 +335,43 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) if (logInfos.nonEmpty) { logDebug(s"New/updated attempts found: ${logInfos.size} ${logInfos.map(_.getPath)}") } - logInfos.map { file => - replayExecutor.submit(new Runnable { + + var tasks = mutable.ListBuffer[Future[_]]() + + try { + for (file <- logInfos) { + tasks += replayExecutor.submit(new Runnable { override def run(): Unit = mergeApplicationListing(file) }) } - .foreach { task => - try { - // Wait for all tasks to finish. This makes sure that checkForLogs - // is not scheduled again while some tasks are already running in - // the replayExecutor. - task.get() - } catch { - case e: InterruptedException => - throw e - case e: Exception => - logError("Exception while merging application listings", e) - } + } catch { + // let the iteration over logInfos break, since an exception on + // replayExecutor.submit (..) indicates the ExecutorService is unable + // to take any more submissions at this time + + case e: Exception => + logError(s"Exception while submitting event log for replay", e) + } + + pendingReplayTasksCount.addAndGet(tasks.size) + + tasks.foreach { task => + try { + // Wait for all tasks to finish. This makes sure that checkForLogs + // is not scheduled again while some tasks are already running in + // the replayExecutor. 
+ task.get() + } catch { + case e: InterruptedException => + throw e + case e: Exception => + logError("Exception while merging application listings", e) + } finally { + pendingReplayTasksCount.decrementAndGet() } + } - lastScanTime = newLastScanTime + lastScanTime.set(newLastScanTime) } catch { case e: Exception => logError("Exception in checking for event log updates", e) } @@ -365,7 +388,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } catch { case e: Exception => logError("Exception encountered when attempting to update last scan time", e) - lastScanTime + lastScanTime.get() } finally { if (!fs.delete(path, true)) { logWarning(s"Error deleting ${path}") diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 96b9ecf43b14c..0e7a6c24d4fa5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -30,13 +30,30 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") Option(request.getParameter("showIncomplete")).getOrElse("false").toBoolean val allAppsSize = parent.getApplicationList().count(_.completed != requestedIncomplete) + val eventLogsUnderProcessCount = parent.getEventLogsUnderProcess() + val lastUpdatedTime = parent.getLastUpdatedTime() val providerConfig = parent.getProviderConfig() val content = +
      <script src={UIUtils.prependBaseUri("/static/historypage-common.js")}></script>
       <div>
           <div class="span12">
             <ul class="unstyled">
               {providerConfig.map { case (k, v) => <li><strong>{k}:</strong> {v}</li> }}
             </ul>
+            {
+            if (eventLogsUnderProcessCount > 0) {
+              <p>There are {eventLogsUnderProcessCount} event log(s) currently being
+                processed which may result in additional applications getting listed on this page.
+                Refresh the page to view updates.</p>
+            }
+            }
+
+            {
+            if (lastUpdatedTime > 0) {
+              <p>Last updated: <span id="last-updated">{lastUpdatedTime}</span></p>
+            }
+            }
+
             {
             if (allAppsSize > 0) {
               <script src={UIUtils.prependBaseUri("/static/dataTables.rowsGroup.js")}></script> ++
@@ -46,6 +63,8 @@
             } else if (requestedIncomplete) {
               <h4>No incomplete applications found!</h4>
+            } else if (eventLogsUnderProcessCount > 0) {
+              <h4>No completed applications found!</h4>
             } else {
               <h4>No completed applications found!</h4>
++ parent.emptyListingHtml } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 3175b36b3e56f..7e21fa681aa1e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -179,6 +179,14 @@ class HistoryServer( provider.getListing() } + def getEventLogsUnderProcess(): Int = { + provider.getEventLogsUnderProcess() + } + + def getLastUpdatedTime(): Long = { + provider.getLastUpdatedTime() + } + def getApplicationInfoList: Iterator[ApplicationInfo] = { getApplicationList().map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) } From 465e4b40b3b7760bfcd0f03a14b805029ed599f1 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 11 Nov 2016 13:28:18 -0800 Subject: [PATCH 089/534] [SPARK-17982][SQL] SQLBuilder should wrap the generated SQL with parenthesis for LIMIT ## What changes were proposed in this pull request? Currently, `SQLBuilder` handles `LIMIT` by always adding `LIMIT` at the end of the generated subSQL. It makes `RuntimeException`s like the following. This PR adds a parenthesis always except `SubqueryAlias` is used together with `LIMIT`. **Before** ``` scala scala> sql("CREATE TABLE tbl(id INT)") scala> sql("CREATE VIEW v1(id2) AS SELECT id FROM tbl LIMIT 2") java.lang.RuntimeException: Failed to analyze the canonicalized SQL: ... ``` **After** ``` scala scala> sql("CREATE TABLE tbl(id INT)") scala> sql("CREATE VIEW v1(id2) AS SELECT id FROM tbl LIMIT 2") scala> sql("SELECT id2 FROM v1") res4: org.apache.spark.sql.DataFrame = [id2: int] ``` **Fixed cases in this PR** The following two cases are the detail query plans having problematic SQL generations. 1. `SELECT * FROM (SELECT id FROM tbl LIMIT 2)` Please note that **FROM SELECT** part of the generated SQL in the below. When we don't use '()' for limit, this fails. ```scala # Original logical plan: Project [id#1] +- GlobalLimit 2 +- LocalLimit 2 +- Project [id#1] +- MetastoreRelation default, tbl # Canonicalized logical plan: Project [gen_attr_0#1 AS id#4] +- SubqueryAlias tbl +- Project [gen_attr_0#1] +- GlobalLimit 2 +- LocalLimit 2 +- Project [gen_attr_0#1] +- SubqueryAlias gen_subquery_0 +- Project [id#1 AS gen_attr_0#1] +- SQLTable default, tbl, [id#1] # Generated SQL: SELECT `gen_attr_0` AS `id` FROM (SELECT `gen_attr_0` FROM SELECT `gen_attr_0` FROM (SELECT `id` AS `gen_attr_0` FROM `default`.`tbl`) AS gen_subquery_0 LIMIT 2) AS tbl ``` 2. `SELECT * FROM (SELECT id FROM tbl TABLESAMPLE (2 ROWS))` Please note that **((~~~) AS gen_subquery_0 LIMIT 2)** in the below. When we use '()' for limit on `SubqueryAlias`, this fails. ```scala # Original logical plan: Project [id#1] +- Project [id#1] +- GlobalLimit 2 +- LocalLimit 2 +- MetastoreRelation default, tbl # Canonicalized logical plan: Project [gen_attr_0#1 AS id#4] +- SubqueryAlias tbl +- Project [gen_attr_0#1] +- GlobalLimit 2 +- LocalLimit 2 +- SubqueryAlias gen_subquery_0 +- Project [id#1 AS gen_attr_0#1] +- SQLTable default, tbl, [id#1] # Generated SQL: SELECT `gen_attr_0` AS `id` FROM (SELECT `gen_attr_0` FROM ((SELECT `id` AS `gen_attr_0` FROM `default`.`tbl`) AS gen_subquery_0 LIMIT 2)) AS tbl ``` ## How was this patch tested? Pass the Jenkins test with a newly added test case. Author: Dongjoon Hyun Closes #15546 from dongjoon-hyun/SPARK-17982. 
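The fix in the diff below is a single extra pattern-match case: a `LIMIT` that sits directly on a `SubqueryAlias` is emitted as before, while any other `LIMIT` is wrapped in parentheses so the generated query can be nested inside a FROM clause. A self-contained toy model of that rule (not the real `SQLBuilder`, just a sketch):

```scala
// Illustrative, heavily simplified model of the two cases the fix distinguishes.
sealed trait Plan
case class Table(name: String) extends Plan
case class SubqueryAlias(alias: String, child: Plan) extends Plan
case class Limit(n: Int, child: Plan) extends Plan

def toSQL(plan: Plan): String = plan match {
  case Table(name)                 => s"SELECT * FROM `$name`"
  case SubqueryAlias(alias, child) => s"(${toSQL(child)}) AS $alias"
  // A LIMIT directly on top of an aliased subquery (e.g. TABLESAMPLE (n ROWS))
  // must not be wrapped again, otherwise "((...) AS x LIMIT n)" is produced.
  case Limit(n, child: SubqueryAlias) => s"${toSQL(child)} LIMIT $n"
  // Everywhere else the limited query must be parenthesized so it can be
  // nested inside a FROM clause.
  case Limit(n, child) => s"(${toSQL(child)} LIMIT $n)"
}

// toSQL(SubqueryAlias("tbl", Limit(2, Table("tbl"))))
//   => "((SELECT * FROM `tbl` LIMIT 2)) AS tbl"   (usable inside FROM)
// toSQL(Limit(2, SubqueryAlias("g", Table("tbl"))))
//   => "(SELECT * FROM `tbl`) AS g LIMIT 2"       (no extra wrapping)
```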
(cherry picked from commit d42bb7cc4e32c173769bd7da5b9b5eafb510860c) Signed-off-by: gatorsmile --- .../org/apache/spark/sql/catalyst/SQLBuilder.scala | 7 ++++++- .../test/resources/sqlgen/generate_with_other_1.sql | 2 +- .../test/resources/sqlgen/generate_with_other_2.sql | 2 +- sql/hive/src/test/resources/sqlgen/limit.sql | 4 ++++ .../spark/sql/catalyst/LogicalPlanToSQLSuite.scala | 10 ++++++++++ 5 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 sql/hive/src/test/resources/sqlgen/limit.sql diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala index 6f821f80cc4c5..380454267eaf4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala @@ -138,9 +138,14 @@ class SQLBuilder private ( case g: Generate => generateToSQL(g) - case Limit(limitExpr, child) => + // This prevents a pattern of `((...) AS gen_subquery_0 LIMIT 1)` which does not work. + // For example, `SELECT * FROM (SELECT id FROM tbl TABLESAMPLE (2 ROWS))` makes this plan. + case Limit(limitExpr, child: SubqueryAlias) => s"${toSQL(child)} LIMIT ${limitExpr.sql}" + case Limit(limitExpr, child) => + s"(${toSQL(child)} LIMIT ${limitExpr.sql})" + case Filter(condition, child) => val whereOrHaving = child match { case _: Aggregate => "HAVING" diff --git a/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql b/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql index ab444d0c70936..0739f8fff5467 100644 --- a/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql +++ b/sql/hive/src/test/resources/sqlgen/generate_with_other_1.sql @@ -5,4 +5,4 @@ WHERE id > 2 ORDER BY val, id LIMIT 5 -------------------------------------------------------------------------------- -SELECT `gen_attr_0` AS `val`, `gen_attr_1` AS `id` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT gen_subquery_0.`gen_attr_2`, gen_subquery_0.`gen_attr_3`, gen_subquery_0.`gen_attr_4`, gen_subquery_0.`gen_attr_1` FROM (SELECT `arr` AS `gen_attr_2`, `arr2` AS `gen_attr_3`, `json` AS `gen_attr_4`, `id` AS `gen_attr_1` FROM `default`.`parquet_t3`) AS gen_subquery_0 WHERE (`gen_attr_1` > CAST(2 AS BIGINT))) AS gen_subquery_1 LATERAL VIEW explode(`gen_attr_2`) gen_subquery_2 AS `gen_attr_0` ORDER BY `gen_attr_0` ASC NULLS FIRST, `gen_attr_1` ASC NULLS FIRST LIMIT 5) AS parquet_t3 +SELECT `gen_attr_0` AS `val`, `gen_attr_1` AS `id` FROM ((SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT gen_subquery_0.`gen_attr_2`, gen_subquery_0.`gen_attr_3`, gen_subquery_0.`gen_attr_4`, gen_subquery_0.`gen_attr_1` FROM (SELECT `arr` AS `gen_attr_2`, `arr2` AS `gen_attr_3`, `json` AS `gen_attr_4`, `id` AS `gen_attr_1` FROM `default`.`parquet_t3`) AS gen_subquery_0 WHERE (`gen_attr_1` > CAST(2 AS BIGINT))) AS gen_subquery_1 LATERAL VIEW explode(`gen_attr_2`) gen_subquery_2 AS `gen_attr_0` ORDER BY `gen_attr_0` ASC NULLS FIRST, `gen_attr_1` ASC NULLS FIRST LIMIT 5)) AS parquet_t3 diff --git a/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql b/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql index 42a2369f34d1c..c4b344ee238a5 100644 --- a/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql +++ b/sql/hive/src/test/resources/sqlgen/generate_with_other_2.sql @@ -7,4 +7,4 @@ WHERE val > 2 ORDER BY val, id LIMIT 5 -------------------------------------------------------------------------------- -SELECT `gen_attr_0` AS `val`, 
`gen_attr_1` AS `id` FROM (SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `arr` AS `gen_attr_4`, `arr2` AS `gen_attr_3`, `json` AS `gen_attr_5`, `id` AS `gen_attr_1` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr_3`) gen_subquery_2 AS `gen_attr_2` LATERAL VIEW explode(`gen_attr_2`) gen_subquery_3 AS `gen_attr_0` WHERE (`gen_attr_0` > CAST(2 AS BIGINT)) ORDER BY `gen_attr_0` ASC NULLS FIRST, `gen_attr_1` ASC NULLS FIRST LIMIT 5) AS gen_subquery_1 +SELECT `gen_attr_0` AS `val`, `gen_attr_1` AS `id` FROM ((SELECT `gen_attr_0`, `gen_attr_1` FROM (SELECT `arr` AS `gen_attr_4`, `arr2` AS `gen_attr_3`, `json` AS `gen_attr_5`, `id` AS `gen_attr_1` FROM `default`.`parquet_t3`) AS gen_subquery_0 LATERAL VIEW explode(`gen_attr_3`) gen_subquery_2 AS `gen_attr_2` LATERAL VIEW explode(`gen_attr_2`) gen_subquery_3 AS `gen_attr_0` WHERE (`gen_attr_0` > CAST(2 AS BIGINT)) ORDER BY `gen_attr_0` ASC NULLS FIRST, `gen_attr_1` ASC NULLS FIRST LIMIT 5)) AS gen_subquery_1 diff --git a/sql/hive/src/test/resources/sqlgen/limit.sql b/sql/hive/src/test/resources/sqlgen/limit.sql new file mode 100644 index 0000000000000..7a6b060fbf505 --- /dev/null +++ b/sql/hive/src/test/resources/sqlgen/limit.sql @@ -0,0 +1,4 @@ +-- This file is automatically generated by LogicalPlanToSQLSuite. +SELECT * FROM (SELECT id FROM tbl LIMIT 2) +-------------------------------------------------------------------------------- +SELECT `gen_attr_0` AS `id` FROM (SELECT `gen_attr_0` FROM (SELECT `gen_attr_0` FROM (SELECT `id` AS `gen_attr_0`, `name` AS `gen_attr_1` FROM `default`.`tbl`) AS gen_subquery_0 LIMIT 2)) AS tbl diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala index 8696337b9dc8a..557ea44d1c80b 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/catalyst/LogicalPlanToSQLSuite.scala @@ -1173,4 +1173,14 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { ) } } + + test("SPARK-17982 - limit") { + withTable("tbl") { + sql("CREATE TABLE tbl(id INT, name STRING)") + checkSQL( + "SELECT * FROM (SELECT id FROM tbl LIMIT 2)", + "limit" + ) + } + } } From 87820da782fd2d08078227a2ce5c363c3e1cb0f0 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 11 Nov 2016 13:52:10 -0800 Subject: [PATCH 090/534] [SPARK-18387][SQL] Add serialization to checkEvaluation. ## What changes were proposed in this pull request? This removes the serialization test from RegexpExpressionsSuite and replaces it by serializing all expressions in checkEvaluation. This also fixes math constant expressions by making LeafMathExpression Serializable and fixes NumberFormat values that are null or invalid after serialization. ## How was this patch tested? This patch is to tests. Author: Ryan Blue Closes #15847 from rdblue/SPARK-18387-fix-serializable-expressions. 
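The essence of the change below is that `checkEvaluation` now round-trips every expression through `JavaSerializer` before evaluating it, so non-serializable expression state is caught by every existing test rather than by one dedicated regex test. A minimal sketch of that round-trip helper, using only the public serializer API that the diff itself relies on:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.sql.catalyst.expressions.Expression

// Round-trip an expression through Java serialization before evaluating it, so a
// test fails fast if the expression holds non-serializable state (the motivation
// for making LeafMathExpression Serializable and FormatNumber's fields lazy).
def roundTrip(expression: Expression): Expression = {
  val serializer = new JavaSerializer(new SparkConf()).newInstance()
  serializer.deserialize[Expression](serializer.serialize(expression))
}
```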
(cherry picked from commit 6e95325fc3726d260054bd6e7c0717b3c139917e) Signed-off-by: Reynold Xin --- .../expressions/mathExpressions.scala | 2 +- .../expressions/stringExpressions.scala | 44 +++++++++++-------- .../expressions/ExpressionEvalHelper.scala | 15 ++++--- .../expressions/RegexpExpressionsSuite.scala | 16 +------ 4 files changed, 36 insertions(+), 41 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index a60494a5bb69d..65273a77b1054 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -36,7 +36,7 @@ import org.apache.spark.unsafe.types.UTF8String * @param name The short name of the function */ abstract class LeafMathExpression(c: Double, name: String) - extends LeafExpression with CodegenFallback { + extends LeafExpression with CodegenFallback with Serializable { override def dataType: DataType = DoubleType override def foldable: Boolean = true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 5f533fecf8d07..e74ef9a08750e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1431,18 +1431,20 @@ case class FormatNumber(x: Expression, d: Expression) // Associated with the pattern, for the last d value, and we will update the // pattern (DecimalFormat) once the new coming d value differ with the last one. + // This is an Option to distinguish between 0 (numberFormat is valid) and uninitialized after + // serialization (numberFormat has not been updated for dValue = 0). @transient - private var lastDValue: Int = -100 + private var lastDValue: Option[Int] = None // A cached DecimalFormat, for performance concern, we will change it // only if the d value changed. @transient - private val pattern: StringBuffer = new StringBuffer() + private lazy val pattern: StringBuffer = new StringBuffer() // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. 
@transient - private val numberFormat = new DecimalFormat("", new DecimalFormatSymbols(Locale.US)) + private lazy val numberFormat = new DecimalFormat("", new DecimalFormatSymbols(Locale.US)) override protected def nullSafeEval(xObject: Any, dObject: Any): Any = { val dValue = dObject.asInstanceOf[Int] @@ -1450,24 +1452,28 @@ case class FormatNumber(x: Expression, d: Expression) return null } - if (dValue != lastDValue) { - // construct a new DecimalFormat only if a new dValue - pattern.delete(0, pattern.length) - pattern.append("#,###,###,###,###,###,##0") - - // decimal place - if (dValue > 0) { - pattern.append(".") - - var i = 0 - while (i < dValue) { - i += 1 - pattern.append("0") + lastDValue match { + case Some(last) if last == dValue => + // use the current pattern + case _ => + // construct a new DecimalFormat only if a new dValue + pattern.delete(0, pattern.length) + pattern.append("#,###,###,###,###,###,##0") + + // decimal place + if (dValue > 0) { + pattern.append(".") + + var i = 0 + while (i < dValue) { + i += 1 + pattern.append("0") + } } - } - lastDValue = dValue - numberFormat.applyLocalizedPattern(pattern.toString) + lastDValue = Some(dValue) + + numberFormat.applyLocalizedPattern(pattern.toString) } x.dataType match { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 9ceb709185417..f83650424a964 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -22,7 +22,8 @@ import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.exceptions.TestFailedException import org.scalatest.prop.GeneratorDrivenPropertyChecks -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.JavaSerializer import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer @@ -43,13 +44,15 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { protected def checkEvaluation( expression: => Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { + val serializer = new JavaSerializer(new SparkConf()).newInstance + val expr: Expression = serializer.deserialize(serializer.serialize(expression)) val catalystValue = CatalystTypeConverters.convertToCatalyst(expected) - checkEvaluationWithoutCodegen(expression, catalystValue, inputRow) - checkEvaluationWithGeneratedMutableProjection(expression, catalystValue, inputRow) - if (GenerateUnsafeProjection.canSupport(expression.dataType)) { - checkEvalutionWithUnsafeProjection(expression, catalystValue, inputRow) + checkEvaluationWithoutCodegen(expr, catalystValue, inputRow) + checkEvaluationWithGeneratedMutableProjection(expr, catalystValue, inputRow) + if (GenerateUnsafeProjection.canSupport(expr.dataType)) { + checkEvalutionWithUnsafeProjection(expr, catalystValue, inputRow) } - checkEvaluationWithOptimization(expression, catalystValue, inputRow) + checkEvaluationWithOptimization(expr, catalystValue, inputRow) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index d0d1aaa9d299d..5299549e7b4da 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.serializer.JavaSerializer +import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types.StringType @@ -192,17 +191,4 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringSplit(s1, s2), null, row3) } - test("RegExpReplace serialization") { - val serializer = new JavaSerializer(new SparkConf()).newInstance - - val row = create_row("abc", "b", "") - - val s = 's.string.at(0) - val p = 'p.string.at(1) - val r = 'r.string.at(2) - - val expr: RegExpReplace = serializer.deserialize(serializer.serialize(RegExpReplace(s, p, r))) - checkEvaluation(expr, "ac", row) - } - } From c2ebda443b2678e554d859d866af53e2e94822f2 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 11 Nov 2016 15:49:55 -0800 Subject: [PATCH 091/534] [SPARK-18264][SPARKR] build vignettes with package, update vignettes for CRAN release build and add info on release ## What changes were proposed in this pull request? Changes to DESCRIPTION to build vignettes. Changes the metadata for vignettes to generate the recommended format (which is about <10% of size before). Unfortunately it does not look as nice (before - left, after - right) ![image](https://cloud.githubusercontent.com/assets/8969467/20040492/b75883e6-a40d-11e6-9534-25cdd5d59a8b.png) ![image](https://cloud.githubusercontent.com/assets/8969467/20040490/a40f4d42-a40d-11e6-8c91-af00ddcbdad9.png) Also add information on how to run build/release to CRAN later. ## How was this patch tested? manually, unit tests shivaram We need this for branch-2.1 Author: Felix Cheung Closes #15790 from felixcheung/rpkgvignettes. (cherry picked from commit ba23f768f7419039df85530b84258ec31f0c22b4) Signed-off-by: Shivaram Venkataraman --- R/CRAN_RELEASE.md | 91 ++++++++++++++++++++++++++++ R/README.md | 8 +-- R/check-cran.sh | 33 ++++++++-- R/create-docs.sh | 19 +----- R/pkg/DESCRIPTION | 9 ++- R/pkg/vignettes/sparkr-vignettes.Rmd | 9 +-- 6 files changed, 134 insertions(+), 35 deletions(-) create mode 100644 R/CRAN_RELEASE.md diff --git a/R/CRAN_RELEASE.md b/R/CRAN_RELEASE.md new file mode 100644 index 0000000000000..bea8f9fbe4eec --- /dev/null +++ b/R/CRAN_RELEASE.md @@ -0,0 +1,91 @@ +# SparkR CRAN Release + +To release SparkR as a package to CRAN, we would use the `devtools` package. Please work with the +`dev@spark.apache.org` community and R package maintainer on this. + +### Release + +First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control. + +Note that while `check-cran.sh` is running `R CMD check`, it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. + +To upload a release, we would need to update the `cran-comments.md`. 
This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible. + +Once everything is in place, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::release(); .libPaths(paths) +``` + +For more information please refer to http://r-pkgs.had.co.nz/release.html#release-check + +### Testing: build package manually + +To build package manually such as to inspect the resulting `.tar.gz` file content, we would also use the `devtools` package. + +Source package is what get released to CRAN. CRAN would then build platform-specific binary packages from the source package. + +#### Build source package + +To build source package locally without releasing to CRAN, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::build("pkg"); .libPaths(paths) +``` + +(http://r-pkgs.had.co.nz/vignettes.html#vignette-workflow-2) + +Similarly, the source package is also created by `check-cran.sh` with `R CMD build pkg`. + +For example, this should be the content of the source package: + +```sh +DESCRIPTION R inst tests +NAMESPACE build man vignettes + +inst/doc/ +sparkr-vignettes.html +sparkr-vignettes.Rmd +sparkr-vignettes.Rman + +build/ +vignette.rds + +man/ + *.Rd files... + +vignettes/ +sparkr-vignettes.Rmd +``` + +#### Test source package + +To install, run this: + +```sh +R CMD INSTALL SparkR_2.1.0.tar.gz +``` + +With "2.1.0" replaced with the version of SparkR. + +This command installs SparkR to the default libPaths. Once that is done, you should be able to start R and run: + +```R +library(SparkR) +vignette("sparkr-vignettes", package="SparkR") +``` + +#### Build binary package + +To build binary package locally, run in R under the `SPARK_HOME/R` directory: + +```R +paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); devtools::build("pkg", binary = TRUE); .libPaths(paths) +``` + +For example, this should be the content of the binary package: + +```sh +DESCRIPTION Meta R html tests +INDEX NAMESPACE help profile worker +``` diff --git a/R/README.md b/R/README.md index 932d5272d0b4f..47f9a86dfde11 100644 --- a/R/README.md +++ b/R/README.md @@ -6,7 +6,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`. By default the above script uses the system wide installation of R. However, this can be changed to any user installed location of R by setting the environment variable `R_HOME` the full path of the base directory where R is installed, before running install-dev.sh script. 
-Example: +Example: ```bash # where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript export R_HOME=/home/username/R @@ -46,7 +46,7 @@ Sys.setenv(SPARK_HOME="/Users/username/spark") # This line loads SparkR from the installed directory .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths())) library(SparkR) -sc <- sparkR.init(master="local") +sparkR.session() ``` #### Making changes to SparkR @@ -54,11 +54,11 @@ sc <- sparkR.init(master="local") The [instructions](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) for making contributions to Spark also apply to SparkR. If you only make R file changes (i.e. no Scala changes) then you can just re-install the R package using `R/install-dev.sh` and test your changes. Once you have made your changes, please include unit tests for them and run existing unit tests using the `R/run-tests.sh` script as described below. - + #### Generating documentation The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. Also, you may need to install these [prerequisites](https://github.com/apache/spark/tree/master/docs#prerequisites). See also, `R/DOCUMENTATION.md` - + ### Examples, Unit tests SparkR comes with several sample programs in the `examples/src/main/r` directory. diff --git a/R/check-cran.sh b/R/check-cran.sh index bb331466ae931..c5f042848c90c 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -36,11 +36,27 @@ if [ ! -z "$R_HOME" ] fi echo "USING R_HOME = $R_HOME" -# Build the latest docs +# Build the latest docs, but not vignettes, which is built with the package next $FWDIR/create-docs.sh -# Build a zip file containing the source package -"$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg +# Build source package with vignettes +SPARK_HOME="$(cd "${FWDIR}"/..; pwd)" +. "${SPARK_HOME}"/bin/load-spark-env.sh +if [ -f "${SPARK_HOME}/RELEASE" ]; then + SPARK_JARS_DIR="${SPARK_HOME}/jars" +else + SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" +fi + +if [ -d "$SPARK_JARS_DIR" ]; then + # Build a zip file containing the source package with vignettes + SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD build $FWDIR/pkg + + find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete +else + echo "Error Spark JARs not found in $SPARK_HOME" + exit 1 +fi # Run check as-cran. 
VERSION=`grep Version $FWDIR/pkg/DESCRIPTION | awk '{print $NF}'` @@ -54,11 +70,16 @@ fi if [ -n "$NO_MANUAL" ] then - CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual" + CRAN_CHECK_OPTIONS=$CRAN_CHECK_OPTIONS" --no-manual --no-vignettes" fi echo "Running CRAN check with $CRAN_CHECK_OPTIONS options" -"$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz - +if [ -n "$NO_TESTS" ] && [ -n "$NO_MANUAL" ] +then + "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz +else + # This will run tests and/or build vignettes, and require SPARK_HOME + SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz +fi popd > /dev/null diff --git a/R/create-docs.sh b/R/create-docs.sh index 69ffc5f678c36..84e6aa928cb0f 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -20,7 +20,7 @@ # Script to create API docs and vignettes for SparkR # This requires `devtools`, `knitr` and `rmarkdown` to be installed on the machine. -# After running this script the html docs can be found in +# After running this script the html docs can be found in # $SPARK_HOME/R/pkg/html # The vignettes can be found in # $SPARK_HOME/R/pkg/vignettes/sparkr_vignettes.html @@ -52,21 +52,4 @@ Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knit popd -# Find Spark jars. -if [ -f "${SPARK_HOME}/RELEASE" ]; then - SPARK_JARS_DIR="${SPARK_HOME}/jars" -else - SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" -fi - -# Only create vignettes if Spark JARs exist -if [ -d "$SPARK_JARS_DIR" ]; then - # render creates SparkR vignettes - Rscript -e 'library(rmarkdown); paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); render("pkg/vignettes/sparkr-vignettes.Rmd"); .libPaths(paths)' - - find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete -else - echo "Skipping R vignettes as Spark JARs not found in $SPARK_HOME" -fi - popd diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 5a83883089e0e..fe41a9e7dabbd 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,8 +1,8 @@ Package: SparkR Type: Package Title: R Frontend for Apache Spark -Version: 2.0.0 -Date: 2016-08-27 +Version: 2.1.0 +Date: 2016-11-06 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), email = "shivaram@cs.berkeley.edu"), person("Xiangrui", "Meng", role = "aut", @@ -18,7 +18,9 @@ Depends: Suggests: testthat, e1071, - survival + survival, + knitr, + rmarkdown Description: The SparkR package provides an R frontend for Apache Spark. 
License: Apache License (== 2.0) Collate: @@ -48,3 +50,4 @@ Collate: 'utils.R' 'window.R' RoxygenNote: 5.0.1 +VignetteBuilder: knitr diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 80e876027bddb..73a5e26a3ba9c 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -1,12 +1,13 @@ --- title: "SparkR - Practical Guide" output: - html_document: - theme: united + rmarkdown::html_vignette: toc: true toc_depth: 4 - toc_float: true - highlight: textmate +vignette: > + %\VignetteIndexEntry{SparkR - Practical Guide} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} --- ## Overview From 56859c029476bc41b2d2e05043c119146b287bce Mon Sep 17 00:00:00 2001 From: sethah Date: Sat, 12 Nov 2016 01:38:26 +0000 Subject: [PATCH 092/534] [SPARK-18060][ML] Avoid unnecessary computation for MLOR ## What changes were proposed in this pull request? Before this patch, the gradient updates for multinomial logistic regression were computed by an outer loop over the number of classes and an inner loop over the number of features. Inside the inner loop, we standardized the feature value (`value / featuresStd(index)`), which means we performed the computation `numFeatures * numClasses` times. We only need to perform that computation `numFeatures` times, however. If we re-order the inner and outer loop, we can avoid this, but then we lose sequential memory access. In this patch, we instead lay out the coefficients in column major order while we train, so that we can avoid the extra computation and retain sequential memory access. We convert back to row-major order when we create the model. ## How was this patch tested? This is an implementation detail only, so the original behavior should be maintained. All tests pass. I ran some performance tests to verify speedups. The results are below, and show significant speedups. ## Performance Tests **Setup** 3 node bare-metal cluster 120 cores total 384 gb RAM total **Results** NOTE: The `currentMasterTime` and `thisPatchTime` are times in seconds for a single iteration of L-BFGS or OWL-QN. | | numPoints | numFeatures | numClasses | regParam | elasticNetParam | currentMasterTime (sec) | thisPatchTime (sec) | pctSpeedup | |----|-------------|---------------|--------------|------------|-------------------|---------------------------|-----------------------|--------------| | 0 | 1e+07 | 100 | 500 | 0.5 | 0 | 90 | 18 | 80 | | 1 | 1e+08 | 100 | 50 | 0.5 | 0 | 90 | 19 | 78 | | 2 | 1e+08 | 100 | 50 | 0.05 | 1 | 72 | 19 | 73 | | 3 | 1e+06 | 100 | 5000 | 0.5 | 0 | 93 | 53 | 43 | | 4 | 1e+07 | 100 | 5000 | 0.5 | 0 | 900 | 390 | 56 | | 5 | 1e+08 | 100 | 500 | 0.5 | 0 | 840 | 174 | 79 | | 6 | 1e+08 | 100 | 200 | 0.5 | 0 | 360 | 72 | 80 | | 7 | 1e+08 | 1000 | 5 | 0.5 | 0 | 9 | 3 | 66 | Author: sethah Closes #15593 from sethah/MLOR_PERF_COL_MAJOR_COEF. 
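For illustration, here is a minimal, self-contained sketch of the index mapping described above (the object and helper names are hypothetical, not code from this patch): during training the flat coefficient array is column major, so all classes for a given feature sit next to each other and the feature value only needs to be standardized once, while the model stores coefficients row major.

```scala
// Illustrative sketch only: column-major vs. row-major indexing of a flat
// coefficient array for multinomial logistic regression.
object CoefficientLayoutSketch {
  def main(args: Array[String]): Unit = {
    val numClasses = 3   // hypothetical k
    val numFeatures = 2  // hypothetical number of features

    // Column-major (training): beta_11, beta_21, beta_31, beta_12, beta_22, beta_32,
    // followed by intercept_1, intercept_2, intercept_3.
    def colMajorIndex(classIdx: Int, featureIdx: Int): Int =
      featureIdx * numClasses + classIdx
    def colMajorInterceptIndex(classIdx: Int): Int =
      numClasses * numFeatures + classIdx

    // Row-major (model): beta_11, beta_12, beta_21, beta_22, beta_31, beta_32.
    def rowMajorIndex(classIdx: Int, featureIdx: Int): Int =
      classIdx * numFeatures + featureIdx

    // Walking features in the outer loop touches contiguous column-major slots,
    // so each standardized feature value is reused for every class.
    for (featureIdx <- 0 until numFeatures; classIdx <- 0 until numClasses) {
      println(s"beta(class=$classIdx, feature=$featureIdx): " +
        s"colMajor=${colMajorIndex(classIdx, featureIdx)}, " +
        s"rowMajor=${rowMajorIndex(classIdx, featureIdx)}")
    }
    (0 until numClasses).foreach { c =>
      println(s"intercept(class=$c): colMajor=${colMajorInterceptIndex(c)}")
    }
  }
}
```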
(cherry picked from commit 46b2550bcd3690a260b995fd4d024a73b92a0299) Signed-off-by: DB Tsai --- .../classification/LogisticRegression.scala | 125 +++++++++++------- 1 file changed, 74 insertions(+), 51 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c4651054fd765..18b9b3043db8a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -438,18 +438,14 @@ class LogisticRegression @Since("1.2.0") ( val standardizationParam = $(standardization) def regParamL1Fun = (index: Int) => { // Remove the L1 penalization on the intercept - val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0) + val isIntercept = $(fitIntercept) && index >= numFeatures * numCoefficientSets if (isIntercept) { 0.0 } else { if (standardizationParam) { regParamL1 } else { - val featureIndex = if ($(fitIntercept)) { - index % numFeaturesPlusIntercept - } else { - index % numFeatures - } + val featureIndex = index / numCoefficientSets // If `standardization` is false, we still standardize the data // to improve the rate of convergence; as a result, we have to // perform this reverse standardization by penalizing each component @@ -466,6 +462,15 @@ class LogisticRegression @Since("1.2.0") ( new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } + /* + The coefficients are laid out in column major order during training. e.g. for + `numClasses = 3` and `numFeatures = 2` and `fitIntercept = true` the layout is: + + Array(beta_11, beta_21, beta_31, beta_12, beta_22, beta_32, intercept_1, intercept_2, + intercept_3) + + where beta_jk corresponds to the coefficient for class `j` and feature `k`. + */ val initialCoefficientsWithIntercept = Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept) @@ -489,13 +494,14 @@ class LogisticRegression @Since("1.2.0") ( val initialCoefWithInterceptArray = initialCoefficientsWithIntercept.toArray val providedCoef = optInitialModel.get.coefficientMatrix providedCoef.foreachActive { (row, col, value) => - val flatIndex = row * numFeaturesPlusIntercept + col + // convert matrix to column major for training + val flatIndex = col * numCoefficientSets + row // We need to scale the coefficients since they will be trained in the scaled space initialCoefWithInterceptArray(flatIndex) = value * featuresStd(col) } if ($(fitIntercept)) { optInitialModel.get.interceptVector.foreachActive { (index, value) => - val coefIndex = (index + 1) * numFeaturesPlusIntercept - 1 + val coefIndex = numCoefficientSets * numFeatures + index initialCoefWithInterceptArray(coefIndex) = value } } @@ -526,7 +532,7 @@ class LogisticRegression @Since("1.2.0") ( val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing val rawMean = rawIntercepts.sum / rawIntercepts.length rawIntercepts.indices.foreach { i => - initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) = + initialCoefficientsWithIntercept.toArray(numClasses * numFeatures + i) = rawIntercepts(i) - rawMean } } else if ($(fitIntercept)) { @@ -572,16 +578,20 @@ class LogisticRegression @Since("1.2.0") ( /* The coefficients are trained in the scaled space; we're converting them back to the original space. 
+ + Additionally, since the coefficients were laid out in column major order during training + to avoid extra computation, we convert them back to row major before passing them to the + model. + Note that the intercept in scaled space and original space is the same; as a result, no scaling is needed. */ val rawCoefficients = state.x.toArray.clone() val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i => - // flatIndex will loop though rawCoefficients, and skip the intercept terms. - val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i + val colMajorIndex = (i % numFeatures) * numCoefficientSets + i / numFeatures val featureIndex = i % numFeatures if (featuresStd(featureIndex) != 0.0) { - rawCoefficients(flatIndex) / featuresStd(featureIndex) + rawCoefficients(colMajorIndex) / featuresStd(featureIndex) } else { 0.0 } @@ -618,7 +628,7 @@ class LogisticRegression @Since("1.2.0") ( val interceptsArray: Array[Double] = if ($(fitIntercept)) { Array.tabulate(numCoefficientSets) { i => - val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1 + val coefIndex = numFeatures * numCoefficientSets + i rawCoefficients(coefIndex) } } else { @@ -697,6 +707,7 @@ class LogisticRegressionModel private[spark] ( /** * A vector of model coefficients for "binomial" logistic regression. If this model was trained * using the "multinomial" family then an exception is thrown. + * * @return Vector */ @Since("2.0.0") @@ -720,6 +731,7 @@ class LogisticRegressionModel private[spark] ( /** * The model intercept for "binomial" logistic regression. If this model was fit with the * "multinomial" family then an exception is thrown. + * * @return Double */ @Since("1.3.0") @@ -1389,6 +1401,12 @@ class BinaryLogisticRegressionSummary private[classification] ( * $$ *

* + * @note In order to avoid unnecessary computation during calculation of the gradient updates + * we lay out the coefficients in column major order during training. This allows us to + * perform feature standardization once, while still retaining sequential memory access + * for speed. We convert back to row major order when we create the model, + * since this form is optimal for the matrix operations used for prediction. + * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. * @param numClasses the number of possible outcomes for k classes classification problem in @@ -1486,23 +1504,25 @@ private class LogisticAggregator( var marginOfLabel = 0.0 var maxMargin = Double.NegativeInfinity - val margins = Array.tabulate(numClasses) { i => - var margin = 0.0 - features.foreachActive { (index, value) => - if (localFeaturesStd(index) != 0.0 && value != 0.0) { - margin += localCoefficients(i * numFeaturesPlusIntercept + index) * - value / localFeaturesStd(index) - } + val margins = new Array[Double](numClasses) + features.foreachActive { (index, value) => + val stdValue = value / localFeaturesStd(index) + var j = 0 + while (j < numClasses) { + margins(j) += localCoefficients(index * numClasses + j) * stdValue + j += 1 } - + } + var i = 0 + while (i < numClasses) { if (fitIntercept) { - margin += localCoefficients(i * numFeaturesPlusIntercept + numFeatures) + margins(i) += localCoefficients(numClasses * numFeatures + i) } - if (i == label.toInt) marginOfLabel = margin - if (margin > maxMargin) { - maxMargin = margin + if (i == label.toInt) marginOfLabel = margins(i) + if (margins(i) > maxMargin) { + maxMargin = margins(i) } - margin + i += 1 } /** @@ -1510,33 +1530,39 @@ private class LogisticAggregator( * We address this by subtracting maxMargin from all the margins, so it's guaranteed * that all of the new margins will be smaller than zero to prevent arithmetic overflow. 
*/ + val multipliers = new Array[Double](numClasses) val sum = { var temp = 0.0 - if (maxMargin > 0) { - for (i <- 0 until numClasses) { - margins(i) -= maxMargin - temp += math.exp(margins(i)) - } - } else { - for (i <- 0 until numClasses) { - temp += math.exp(margins(i)) - } + var i = 0 + while (i < numClasses) { + if (maxMargin > 0) margins(i) -= maxMargin + val exp = math.exp(margins(i)) + temp += exp + multipliers(i) = exp + i += 1 } temp } - for (i <- 0 until numClasses) { - val multiplier = math.exp(margins(i)) / sum - { - if (label == i) 1.0 else 0.0 - } - features.foreachActive { (index, value) => - if (localFeaturesStd(index) != 0.0 && value != 0.0) { - localGradientArray(i * numFeaturesPlusIntercept + index) += - weight * multiplier * value / localFeaturesStd(index) + margins.indices.foreach { i => + multipliers(i) = multipliers(i) / sum - (if (label == i) 1.0 else 0.0) + } + features.foreachActive { (index, value) => + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + val stdValue = value / localFeaturesStd(index) + var j = 0 + while (j < numClasses) { + localGradientArray(index * numClasses + j) += + weight * multipliers(j) * stdValue + j += 1 } } - if (fitIntercept) { - localGradientArray(i * numFeaturesPlusIntercept + numFeatures) += weight * multiplier + } + if (fitIntercept) { + var i = 0 + while (i < numClasses) { + localGradientArray(numFeatures * numClasses + i) += weight * multipliers(i) + i += 1 } } @@ -1637,6 +1663,7 @@ private class LogisticCostFun( val bcCoeffs = instances.context.broadcast(coeffs) val featuresStd = bcFeaturesStd.value val numFeatures = featuresStd.length + val numCoefficientSets = if (multinomial) numClasses else 1 val logisticAggregator = { val seqOp = (c: LogisticAggregator, instance: Instance) => c.add(instance) @@ -1656,7 +1683,7 @@ private class LogisticCostFun( var sum = 0.0 coeffs.foreachActive { case (index, value) => // We do not apply regularization to the intercepts - val isIntercept = fitIntercept && ((index + 1) % (numFeatures + 1) == 0) + val isIntercept = fitIntercept && index >= numCoefficientSets * numFeatures if (!isIntercept) { // The following code will compute the loss of the regularization; also // the gradient of the regularization, and add back to totalGradientArray. @@ -1665,11 +1692,7 @@ private class LogisticCostFun( totalGradientArray(index) += regParamL2 * value value * value } else { - val featureIndex = if (fitIntercept) { - index % (numFeatures + 1) - } else { - index % numFeatures - } + val featureIndex = index / numCoefficientSets if (featuresStd(featureIndex) != 0.0) { // If `standardization` is false, we still standardize the data // to improve the rate of convergence; as a result, we have to From 893355143a177f1fea1d2fb6f6e617574e5c5e52 Mon Sep 17 00:00:00 2001 From: Guoqiang Li Date: Sat, 12 Nov 2016 09:49:14 +0000 Subject: [PATCH 093/534] [SPARK-18375][SPARK-18383][BUILD][CORE] Upgrade netty to 4.0.42.Final ## What changes were proposed in this pull request? One of the important changes for 4.0.42.Final is "Support any FileRegion implementation when using epoll transport netty/netty#5825". In 4.0.42.Final, `MessageWithHeader` can work properly when `spark.[shuffle|rpc].io.mode` is set to epoll ## How was this patch tested? Existing tests Author: Guoqiang Li Closes #15830 from witgo/SPARK-18375_netty-4.0.42. 
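For reference, a minimal sketch of the configuration the message above refers to (the application name is hypothetical, and the epoll transport is only available on Linux):

```scala
import org.apache.spark.SparkConf

// Illustrative sketch: switching the shuffle and RPC transports to epoll,
// the mode that benefits from the FileRegion fix in netty 4.0.42.Final.
val conf = new SparkConf()
  .setAppName("epoll-transport-example") // hypothetical app name
  .set("spark.shuffle.io.mode", "epoll") // shuffle transport
  .set("spark.rpc.io.mode", "epoll")     // RPC transport
```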
(cherry picked from commit bc41d997ea287080f549219722b6d9049adef4e2) Signed-off-by: Sean Owen --- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 ++++ dev/deps/spark-deps-hadoop-2.2 | 2 +- dev/deps/spark-deps-hadoop-2.3 | 2 +- dev/deps/spark-deps-hadoop-2.4 | 2 +- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- pom.xml | 2 +- 7 files changed, 10 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 1de66af632a8a..892e112e18f85 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -39,6 +39,7 @@ import scala.reflect.ClassTag import scala.util.Try import scala.util.control.{ControlThrowable, NonFatal} +import _root_.io.netty.channel.unix.Errors.NativeIoException import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} import com.google.common.io.{ByteStreams, Files => GFiles} import com.google.common.net.InetAddresses @@ -2222,6 +2223,9 @@ private[spark] object Utils extends Logging { isBindCollision(e.getCause) case e: MultiException => e.getThrowables.asScala.exists(isBindCollision) + case e: NativeIoException => + (e.getMessage != null && e.getMessage.startsWith("bind() failed: ")) || + isBindCollision(e.getCause) case e: Exception => isBindCollision(e.getCause) case _ => false } diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 6e749ac16cac0..bbdea069f9496 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -123,7 +123,7 @@ metrics-json-3.1.2.jar metrics-jvm-3.1.2.jar minlog-1.3.0.jar netty-3.8.0.Final.jar -netty-all-4.0.41.Final.jar +netty-all-4.0.42.Final.jar objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index 515995a0a46bd..a2dec41d64519 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -130,7 +130,7 @@ metrics-jvm-3.1.2.jar minlog-1.3.0.jar mx4j-3.0.2.jar netty-3.8.0.Final.jar -netty-all-4.0.41.Final.jar +netty-all-4.0.42.Final.jar objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index d2139fd952406..c1f02b93d751c 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -130,7 +130,7 @@ metrics-jvm-3.1.2.jar minlog-1.3.0.jar mx4j-3.0.2.jar netty-3.8.0.Final.jar -netty-all-4.0.41.Final.jar +netty-all-4.0.42.Final.jar objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index b5cecf72ec35f..4f04636be712b 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -138,7 +138,7 @@ metrics-jvm-3.1.2.jar minlog-1.3.0.jar mx4j-3.0.2.jar netty-3.8.0.Final.jar -netty-all-4.0.41.Final.jar +netty-all-4.0.42.Final.jar objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index a5e03a78e7ea8..da3af9ffa155b 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -139,7 +139,7 @@ metrics-jvm-3.1.2.jar minlog-1.3.0.jar mx4j-3.0.2.jar netty-3.8.0.Final.jar -netty-all-4.0.41.Final.jar +netty-all-4.0.42.Final.jar objenesis-2.1.jar opencsv-2.3.jar oro-2.0.8.jar diff --git a/pom.xml b/pom.xml index 8aa0a6c3caab9..650b4cd965b66 100644 --- a/pom.xml +++ b/pom.xml @@ -552,7 +552,7 @@ io.netty netty-all - 
4.0.41.Final + 4.0.42.Final io.netty From b2ba83d10ac06614c0126f4b0d913f6979051682 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 12 Nov 2016 06:13:22 -0800 Subject: [PATCH 094/534] [SPARK-14077][ML][FOLLOW-UP] Minor refactor and cleanup for NaiveBayes ## What changes were proposed in this pull request? * Refactor out ```trainWithLabelCheck``` and make ```mllib.NaiveBayes``` call into it. * Avoid capturing the outer object for ```modelType```. * Move ```requireNonnegativeValues``` and ```requireZeroOneBernoulliValues``` to companion object. ## How was this patch tested? Existing tests. Author: Yanbo Liang Closes #15826 from yanboliang/spark-14077-2. (cherry picked from commit 22cb3a060a440205281b71686637679645454ca6) Signed-off-by: Yanbo Liang --- .../spark/ml/classification/NaiveBayes.scala | 72 +++++++++---------- .../mllib/classification/NaiveBayes.scala | 6 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index b03a07a6bc1e7..f1a7676c74b0e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -76,7 +76,7 @@ class NaiveBayes @Since("1.5.0") ( extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] with NaiveBayesParams with DefaultParamsWritable { - import NaiveBayes.{Bernoulli, Multinomial} + import NaiveBayes._ @Since("1.5.0") def this() = this(Identifiable.randomUID("nb")) @@ -110,21 +110,20 @@ class NaiveBayes @Since("1.5.0") ( @Since("2.1.0") def setWeightCol(value: String): this.type = set(weightCol, value) + override protected def train(dataset: Dataset[_]): NaiveBayesModel = { + trainWithLabelCheck(dataset, positiveLabel = true) + } + /** * ml assumes input labels in range [0, numClasses). But this implementation * is also called by mllib NaiveBayes which allows other kinds of input labels - * such as {-1, +1}. Here we use this parameter to switch between different processing logic. - * It should be removed when we remove mllib NaiveBayes. + * such as {-1, +1}. `positiveLabel` is used to determine whether the label + * should be checked and it should be removed when we remove mllib NaiveBayes. 
*/ - private[spark] var isML: Boolean = true - - private[spark] def setIsML(isML: Boolean): this.type = { - this.isML = isML - this - } - - override protected def train(dataset: Dataset[_]): NaiveBayesModel = { - if (isML) { + private[spark] def trainWithLabelCheck( + dataset: Dataset[_], + positiveLabel: Boolean): NaiveBayesModel = { + if (positiveLabel) { val numClasses = getNumClasses(dataset) if (isDefined(thresholds)) { require($(thresholds).length == numClasses, this.getClass.getSimpleName + @@ -133,28 +132,9 @@ class NaiveBayes @Since("1.5.0") ( } } - val requireNonnegativeValues: Vector => Unit = (v: Vector) => { - val values = v match { - case sv: SparseVector => sv.values - case dv: DenseVector => dv.values - } - - require(values.forall(_ >= 0.0), - s"Naive Bayes requires nonnegative feature values but found $v.") - } - - val requireZeroOneBernoulliValues: Vector => Unit = (v: Vector) => { - val values = v match { - case sv: SparseVector => sv.values - case dv: DenseVector => dv.values - } - - require(values.forall(v => v == 0.0 || v == 1.0), - s"Bernoulli naive Bayes requires 0 or 1 feature values but found $v.") - } - + val modelTypeValue = $(modelType) val requireValues: Vector => Unit = { - $(modelType) match { + modelTypeValue match { case Multinomial => requireNonnegativeValues case Bernoulli => @@ -226,13 +206,33 @@ class NaiveBayes @Since("1.5.0") ( @Since("1.6.0") object NaiveBayes extends DefaultParamsReadable[NaiveBayes] { /** String name for multinomial model type. */ - private[spark] val Multinomial: String = "multinomial" + private[classification] val Multinomial: String = "multinomial" /** String name for Bernoulli model type. */ - private[spark] val Bernoulli: String = "bernoulli" + private[classification] val Bernoulli: String = "bernoulli" /* Set of modelTypes that NaiveBayes supports */ - private[spark] val supportedModelTypes = Set(Multinomial, Bernoulli) + private[classification] val supportedModelTypes = Set(Multinomial, Bernoulli) + + private[NaiveBayes] def requireNonnegativeValues(v: Vector): Unit = { + val values = v match { + case sv: SparseVector => sv.values + case dv: DenseVector => dv.values + } + + require(values.forall(_ >= 0.0), + s"Naive Bayes requires nonnegative feature values but found $v.") + } + + private[NaiveBayes] def requireZeroOneBernoulliValues(v: Vector): Unit = { + val values = v match { + case sv: SparseVector => sv.values + case dv: DenseVector => dv.values + } + + require(values.forall(v => v == 0.0 || v == 1.0), + s"Bernoulli naive Bayes requires 0 or 1 feature values but found $v.") + } @Since("1.6.0") override def load(path: String): NaiveBayes = super.load(path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 33561be4b5bc1..767d056861a8b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -364,12 +364,12 @@ class NaiveBayes private ( val nb = new NewNaiveBayes() .setModelType(modelType) .setSmoothing(lambda) - .setIsML(false) val dataset = data.map { case LabeledPoint(label, features) => (label, features.asML) } .toDF("label", "features") - val newModel = nb.fit(dataset) + // mllib NaiveBayes allows input labels like {-1, +1}, so set `positiveLabel` as false. 
+ val newModel = nb.trainWithLabelCheck(dataset, positiveLabel = false) val pi = newModel.pi.toArray val theta = Array.fill[Double](newModel.numClasses, newModel.numFeatures)(0.0) @@ -378,7 +378,7 @@ class NaiveBayes private ( theta(i)(j) = v } - require(newModel.oldLabels != null, + assert(newModel.oldLabels != null, "The underlying ML NaiveBayes training does not produce labels.") new NaiveBayesModel(newModel.oldLabels, pi, theta, modelType) } From 6fae4241f281638d52071102c7f0ee6c2c73a8c7 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 12 Nov 2016 14:50:37 -0800 Subject: [PATCH 095/534] [SPARK-18418] Fix flags for make_binary_release for hadoop profile ## What changes were proposed in this pull request? Fix the flags used to specify the hadoop version ## How was this patch tested? Manually tested as part of https://github.com/apache/spark/pull/15659 by having the build succeed. cc joshrosen Author: Holden Karau Closes #15860 from holdenk/minor-fix-release-build-script. (cherry picked from commit 1386fd28daf798bf152606f4da30a36223d75d18) Signed-off-by: Josh Rosen --- dev/create-release/release-build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 96f9b5714ebb8..81f0d63054e29 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -187,10 +187,10 @@ if [[ "$1" == "package" ]]; then # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds # share the same Zinc server. FLAGS="-Psparkr -Phive -Phive-thriftserver -Pyarn -Pmesos" - make_binary_release "hadoop2.3" "-Phadoop2.3 $FLAGS" "3033" & - make_binary_release "hadoop2.4" "-Phadoop2.4 $FLAGS" "3034" & - make_binary_release "hadoop2.6" "-Phadoop2.6 $FLAGS" "3035" & - make_binary_release "hadoop2.7" "-Phadoop2.7 $FLAGS" "3036" & + make_binary_release "hadoop2.3" "-Phadoop-2.3 $FLAGS" "3033" & + make_binary_release "hadoop2.4" "-Phadoop-2.4 $FLAGS" "3034" & + make_binary_release "hadoop2.6" "-Phadoop-2.6 $FLAGS" "3035" & + make_binary_release "hadoop2.7" "-Phadoop-2.7 $FLAGS" "3036" & make_binary_release "hadoop2.4-without-hive" "-Psparkr -Phadoop-2.4 -Pyarn -Pmesos" "3037" & make_binary_release "without-hadoop" "-Psparkr -Phadoop-provided -Pyarn -Pmesos" "3038" & wait From 0c69224ed752c25be1545cfe8ba0db8487a70bf2 Mon Sep 17 00:00:00 2001 From: Denny Lee Date: Sun, 13 Nov 2016 18:10:06 -0800 Subject: [PATCH 096/534] [SPARK-18426][STRUCTURED STREAMING] Python Documentation Fix for Structured Streaming Programming Guide ## What changes were proposed in this pull request? Update the python section of the Structured Streaming Guide from .builder() to .builder ## How was this patch tested? Validated documentation and successfully running the test example. Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. 'Builder' object is not callable object hence changed .builder() to .builder Author: Denny Lee Closes #15872 from dennyglee/master. 
(cherry picked from commit b91a51bb231af321860415075a7f404bc46e0a74) Signed-off-by: Reynold Xin --- docs/structured-streaming-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index d838ed35a14fd..d2545584ae3b0 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -58,7 +58,7 @@ from pyspark.sql.functions import explode from pyspark.sql.functions import split spark = SparkSession \ - .builder() \ + .builder \ .appName("StructuredNetworkWordCount") \ .getOrCreate() {% endhighlight %} From 8fc6455c0b77f81be79908bb65e6264bf61c90e7 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sun, 13 Nov 2016 20:25:12 -0800 Subject: [PATCH 097/534] [SPARK-18412][SPARKR][ML] Fix exception for some SparkR ML algorithms training on libsvm data ## What changes were proposed in this pull request? * Fix the following exceptions which throws when ```spark.randomForest```(classification), ```spark.gbt```(classification), ```spark.naiveBayes``` and ```spark.glm```(binomial family) were fitted on libsvm data. ``` java.lang.IllegalArgumentException: requirement failed: If label column already exists, forceIndexLabel can not be set with true. ``` See [SPARK-18412](https://issues.apache.org/jira/browse/SPARK-18412) for more detail about how to reproduce this bug. * Refactor out ```getFeaturesAndLabels``` to RWrapperUtils, since lots of ML algorithm wrappers use this function. * Drop some unwanted columns when making prediction. ## How was this patch tested? Add unit test. Author: Yanbo Liang Closes #15851 from yanboliang/spark-18412. (cherry picked from commit 07be232ea12dfc8dc3701ca948814be7dbebf4ee) Signed-off-by: Yanbo Liang --- R/pkg/inst/tests/testthat/test_mllib.R | 18 ++++++++-- .../spark/ml/r/GBTClassificationWrapper.scala | 18 ++++------ .../GeneralizedLinearRegressionWrapper.scala | 5 ++- .../apache/spark/ml/r/NaiveBayesWrapper.scala | 14 +++----- .../org/apache/spark/ml/r/RWrapperUtils.scala | 36 ++++++++++++++++--- .../r/RandomForestClassificationWrapper.scala | 18 ++++------ 6 files changed, 68 insertions(+), 41 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 33e85b78de4fe..4831ce27bec8a 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -881,7 +881,8 @@ test_that("spark.kstest", { expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:") }) -test_that("spark.randomForest Regression", { +test_that("spark.randomForest", { + # regression data <- suppressWarnings(createDataFrame(longley)) model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, numTrees = 1) @@ -923,9 +924,8 @@ test_that("spark.randomForest Regression", { expect_equal(stats$treeWeights, stats2$treeWeights) unlink(modelPath) -}) -test_that("spark.randomForest Classification", { + # classification data <- suppressWarnings(createDataFrame(iris)) model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification", maxDepth = 5, maxBins = 16) @@ -971,6 +971,12 @@ test_that("spark.randomForest Classification", { predictions <- collect(predict(model, data))$prediction expect_equal(length(grep("1.0", predictions)), 50) expect_equal(length(grep("2.0", predictions)), 50) + + # spark.randomForest classification can work on libsvm data + data <- 
read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), + source = "libsvm") + model <- spark.randomForest(data, label ~ features, "classification") + expect_equal(summary(model)$numFeatures, 4) }) test_that("spark.gbt", { @@ -1039,6 +1045,12 @@ test_that("spark.gbt", { expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction)) expect_equal(s$numFeatures, 5) expect_equal(s$numTrees, 20) + + # spark.gbt classification can work on libsvm data + data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"), + source = "libsvm") + model <- spark.gbt(data, label ~ features, "classification") + expect_equal(summary(model)$numFeatures, 692) }) sparkR.session.stop() diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala index 8946025032200..aacb41ee2659b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GBTClassificationWrapper.scala @@ -23,10 +23,10 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -51,6 +51,7 @@ private[r] class GBTClassifierWrapper private ( pipeline.transform(dataset) .drop(PREDICTED_LABEL_INDEX_COL) .drop(gbtcModel.getFeaturesCol) + .drop(gbtcModel.getLabelCol) } override def write: MLWriter = new @@ -81,19 +82,11 @@ private[r] object GBTClassifierWrapper extends MLReadable[GBTClassifierWrapper] val rFormula = new RFormula() .setFormula(formula) .setForceIndexLabel(true) - RWrapperUtils.checkDataColumns(rFormula, data) + checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) - // get feature names from output schema - val schema = rFormulaModel.transform(data).schema - val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) - .attributes.get - val features = featureAttrs.map(_.name.get) - - // get label names from output schema - val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) - .asInstanceOf[NominalAttribute] - val labels = labelAttr.values.get + // get labels and feature names from output schema + val (features, labels) = getFeaturesAndLabels(rFormulaModel, data) // assemble and fit the pipeline val rfc = new GBTClassifier() @@ -109,6 +102,7 @@ private[r] object GBTClassifierWrapper extends MLReadable[GBTClassifierWrapper] .setMaxMemoryInMB(maxMemoryInMB) .setCacheNodeIds(cacheNodeIds) .setFeaturesCol(rFormula.getFeaturesCol) + .setLabelCol(rFormula.getLabelCol) .setPredictionCol(PREDICTED_LABEL_INDEX_COL) if (seed != null && seed.length > 0) rfc.setSeed(seed.toLong) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 995b1ef03bcec..add4d49110d16 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ 
-29,6 +29,7 @@ import org.apache.spark.ml.regression._ import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ @@ -64,6 +65,7 @@ private[r] class GeneralizedLinearRegressionWrapper private ( .drop(PREDICTED_LABEL_PROB_COL) .drop(PREDICTED_LABEL_INDEX_COL) .drop(glm.getFeaturesCol) + .drop(glm.getLabelCol) } else { pipeline.transform(dataset) .drop(glm.getFeaturesCol) @@ -92,7 +94,7 @@ private[r] object GeneralizedLinearRegressionWrapper regParam: Double): GeneralizedLinearRegressionWrapper = { val rFormula = new RFormula().setFormula(formula) if (family == "binomial") rFormula.setForceIndexLabel(true) - RWrapperUtils.checkDataColumns(rFormula, data) + checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) // get labels and feature names from output schema val schema = rFormulaModel.transform(data).schema @@ -109,6 +111,7 @@ private[r] object GeneralizedLinearRegressionWrapper .setWeightCol(weightCol) .setRegParam(regParam) .setFeaturesCol(rFormula.getFeaturesCol) + .setLabelCol(rFormula.getLabelCol) val pipeline = if (family == "binomial") { // Convert prediction from probability to label index. val probToPred = new ProbabilityToPrediction() diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala index 4fdab2dd94655..0afea4be3d1dd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala @@ -23,9 +23,9 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel} import org.apache.spark.ml.feature.{IndexToString, RFormula} +import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -46,6 +46,7 @@ private[r] class NaiveBayesWrapper private ( pipeline.transform(dataset) .drop(PREDICTED_LABEL_INDEX_COL) .drop(naiveBayesModel.getFeaturesCol) + .drop(naiveBayesModel.getLabelCol) } override def write: MLWriter = new NaiveBayesWrapper.NaiveBayesWrapperWriter(this) @@ -60,21 +61,16 @@ private[r] object NaiveBayesWrapper extends MLReadable[NaiveBayesWrapper] { val rFormula = new RFormula() .setFormula(formula) .setForceIndexLabel(true) - RWrapperUtils.checkDataColumns(rFormula, data) + checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) // get labels and feature names from output schema - val schema = rFormulaModel.transform(data).schema - val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) - .asInstanceOf[NominalAttribute] - val labels = labelAttr.values.get - val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) - .attributes.get - val features = featureAttrs.map(_.name.get) + val (features, labels) = getFeaturesAndLabels(rFormulaModel, data) // assemble and fit the pipeline val naiveBayes = new NaiveBayes() .setSmoothing(smoothing) .setModelType("bernoulli") .setFeaturesCol(rFormula.getFeaturesCol) + .setLabelCol(rFormula.getLabelCol) .setPredictionCol(PREDICTED_LABEL_INDEX_COL) val idxToStr = new IndexToString() 
.setInputCol(PREDICTED_LABEL_INDEX_COL) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala index 379007c4d948d..665e50af67d46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RWrapperUtils.scala @@ -18,11 +18,12 @@ package org.apache.spark.ml.r import org.apache.spark.internal.Logging -import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} +import org.apache.spark.ml.feature.{RFormula, RFormulaModel} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.Dataset -object RWrapperUtils extends Logging { +private[r] object RWrapperUtils extends Logging { /** * DataFrame column check. @@ -32,14 +33,41 @@ object RWrapperUtils extends Logging { * * @param rFormula RFormula instance * @param data Input dataset - * @return Unit */ def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = { if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) { val newFeaturesName = s"${Identifiable.randomUID(rFormula.getFeaturesCol)}" - logWarning(s"data containing ${rFormula.getFeaturesCol} column, " + + logInfo(s"data containing ${rFormula.getFeaturesCol} column, " + s"using new name $newFeaturesName instead") rFormula.setFeaturesCol(newFeaturesName) } + + if (rFormula.getForceIndexLabel && data.schema.fieldNames.contains(rFormula.getLabelCol)) { + val newLabelName = s"${Identifiable.randomUID(rFormula.getLabelCol)}" + logInfo(s"data containing ${rFormula.getLabelCol} column and we force to index label, " + + s"using new name $newLabelName instead") + rFormula.setLabelCol(newLabelName) + } + } + + /** + * Get the feature names and original labels from the schema + * of DataFrame transformed by RFormulaModel. + * + * @param rFormulaModel The RFormulaModel instance. + * @param data Input dataset. + * @return The feature names and original labels. 
+ */ + def getFeaturesAndLabels( + rFormulaModel: RFormulaModel, + data: Dataset[_]): (Array[String], Array[String]) = { + val schema = rFormulaModel.transform(data).schema + val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) + .attributes.get + val features = featureAttrs.map(_.name.get) + val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) + .asInstanceOf[NominalAttribute] + val labels = labelAttr.values.get + (features, labels) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala index 31f846dc6cfec..0b860e5af96e3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/RandomForestClassificationWrapper.scala @@ -23,10 +23,10 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -51,6 +51,7 @@ private[r] class RandomForestClassifierWrapper private ( pipeline.transform(dataset) .drop(PREDICTED_LABEL_INDEX_COL) .drop(rfcModel.getFeaturesCol) + .drop(rfcModel.getLabelCol) } override def write: MLWriter = new @@ -82,19 +83,11 @@ private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestC val rFormula = new RFormula() .setFormula(formula) .setForceIndexLabel(true) - RWrapperUtils.checkDataColumns(rFormula, data) + checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) - // get feature names from output schema - val schema = rFormulaModel.transform(data).schema - val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) - .attributes.get - val features = featureAttrs.map(_.name.get) - - // get label names from output schema - val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) - .asInstanceOf[NominalAttribute] - val labels = labelAttr.values.get + // get labels and feature names from output schema + val (features, labels) = getFeaturesAndLabels(rFormulaModel, data) // assemble and fit the pipeline val rfc = new RandomForestClassifier() @@ -111,6 +104,7 @@ private[r] object RandomForestClassifierWrapper extends MLReadable[RandomForestC .setCacheNodeIds(cacheNodeIds) .setProbabilityCol(probabilityCol) .setFeaturesCol(rFormula.getFeaturesCol) + .setLabelCol(rFormula.getLabelCol) .setPredictionCol(PREDICTED_LABEL_INDEX_COL) if (seed != null && seed.length > 0) rfc.setSeed(seed.toLong) From 12bde11ca0613dbd7d917c81a8b480d5a9355da5 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 14 Nov 2016 16:52:07 +0900 Subject: [PATCH 098/534] [SPARK-18382][WEBUI] "run at null:-1" in UI when no file/line info in call site info ## What changes were proposed in this pull request? Avoid reporting null/-1 file / line number in call sites if encountering StackTraceElement without this info ## How was this patch tested? Existing tests Author: Sean Owen Closes #15862 from srowen/SPARK-18382. 
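For illustration, a small sketch of why the guard is needed: the JDK allows a `StackTraceElement` to carry no source information at all, for example frames from generated or native code (the class and method names below are hypothetical):

```scala
// Illustrative sketch: a frame without file or line information.
val noSourceInfo = new StackTraceElement("com.example.Generated", "apply", null, -1)
assert(noSourceInfo.getFileName == null)   // no file name available
assert(noSourceInfo.getLineNumber < 0)     // no line number available
println(noSourceInfo)                      // com.example.Generated.apply(Unknown Source)
```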
(cherry picked from commit f95b124c68ccc2e318f6ac30685aa47770eea8f3) Signed-off-by: Kousuke Saruta --- core/src/main/scala/org/apache/spark/util/Utils.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 892e112e18f85..a2386d6b9e12f 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1419,8 +1419,12 @@ private[spark] object Utils extends Logging { } callStack(0) = ste.toString // Put last Spark method on top of the stack trace. } else { - firstUserLine = ste.getLineNumber - firstUserFile = ste.getFileName + if (ste.getFileName != null) { + firstUserFile = ste.getFileName + if (ste.getLineNumber >= 0) { + firstUserLine = ste.getLineNumber + } + } callStack += ste.toString insideSpark = false } From d554c02f4f50d3d58661d5f87aacf34152545c24 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 14 Nov 2016 12:08:06 +0100 Subject: [PATCH 099/534] [SPARK-18166][MLLIB] Fix Poisson GLM bug due to wrong requirement of response values ## What changes were proposed in this pull request? The current implementation of Poisson GLM seems to allow only positive values. This is incorrect since the support of Poisson includes the origin. The bug is easily fixed by changing the test of the Poisson variable from 'require(y **>** 0.0' to 'require(y **>=** 0.0'. mengxr srowen Author: actuaryzhang Author: actuaryzhang Closes #15683 from actuaryzhang/master. (cherry picked from commit ae6cddb78742be94aa0851ce719f293e0a64ce4f) Signed-off-by: Sean Owen --- .../GeneralizedLinearRegression.scala | 4 +- .../GeneralizedLinearRegressionSuite.scala | 45 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 1938e8ecc513d..1d2961e0277f5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -501,8 +501,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine val defaultLink: Link = Log override def initialize(y: Double, weight: Double): Double = { - require(y > 0.0, "The response variable of Poisson family " + - s"should be positive, but got $y") + require(y >= 0.0, "The response variable of Poisson family " + + s"should be non-negative, but got $y") y } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 111bc974642d9..6a4ac1735b2cb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -44,6 +44,7 @@ class GeneralizedLinearRegressionSuite @transient var datasetGaussianInverse: DataFrame = _ @transient var datasetBinomial: DataFrame = _ @transient var datasetPoissonLog: DataFrame = _ + @transient var datasetPoissonLogWithZero: DataFrame = _ @transient var datasetPoissonIdentity: DataFrame = _ @transient var datasetPoissonSqrt: DataFrame = _ @transient var datasetGammaInverse: DataFrame = _ @@ -88,6 +89,12 @@ class 
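For illustration, a minimal sketch (not code from this patch) of why zero must be accepted: the Poisson probability mass at zero is strictly positive for any rate, so zero counts are legitimate observations and only negative responses should be rejected.

```scala
// Illustrative sketch: P(Y = 0) = exp(-lambda) > 0 for any lambda > 0.
def poissonPmf(k: Int, lambda: Double): Double = {
  require(k >= 0 && lambda > 0.0)
  // k! computed directly; fine for the small k used in this illustration.
  val kFactorial = (1 to k).foldLeft(1.0)(_ * _)
  math.exp(-lambda) * math.pow(lambda, k) / kFactorial
}

println(poissonPmf(0, lambda = 2.0)) // ~0.135, strictly positive
```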
GeneralizedLinearRegressionSuite xVariance = Array(0.7, 1.2), nPoints = 10000, seed, noiseLevel = 0.01, family = "poisson", link = "log").toDF() + datasetPoissonLogWithZero = generateGeneralizedLinearRegressionInput( + intercept = -1.5, coefficients = Array(0.22, 0.06), xMean = Array(2.9, 10.5), + xVariance = Array(0.7, 1.2), nPoints = 100, seed, noiseLevel = 0.01, + family = "poisson", link = "log") + .map{x => LabeledPoint(if (x.label < 0.7) 0.0 else x.label, x.features)}.toDF() + datasetPoissonIdentity = generateGeneralizedLinearRegressionInput( intercept = 2.5, coefficients = Array(2.2, 0.6), xMean = Array(2.9, 10.5), xVariance = Array(0.7, 1.2), nPoints = 10000, seed, noiseLevel = 0.01, @@ -139,6 +146,10 @@ class GeneralizedLinearRegressionSuite label + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile( "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonLog") + datasetPoissonLogWithZero.rdd.map { case Row(label: Double, features: Vector) => + label + "," + features.toArray.mkString(",") + }.repartition(1).saveAsTextFile( + "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonLogWithZero") datasetPoissonIdentity.rdd.map { case Row(label: Double, features: Vector) => label + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile( @@ -456,6 +467,40 @@ class GeneralizedLinearRegressionSuite } } + test("generalized linear regression: poisson family against glm (with zero values)") { + /* + R code: + f1 <- data$V1 ~ data$V2 + data$V3 - 1 + f2 <- data$V1 ~ data$V2 + data$V3 + + data <- read.csv("path", header=FALSE) + for (formula in c(f1, f2)) { + model <- glm(formula, family="poisson", data=data) + print(as.vector(coef(model))) + } + [1] 0.4272661 -0.1565423 + [1] -3.6911354 0.6214301 0.1295814 + */ + val expected = Seq( + Vectors.dense(0.0, 0.4272661, -0.1565423), + Vectors.dense(-3.6911354, 0.6214301, 0.1295814)) + + import GeneralizedLinearRegression._ + + var idx = 0 + val link = "log" + val dataset = datasetPoissonLogWithZero + for (fitIntercept <- Seq(false, true)) { + val trainer = new GeneralizedLinearRegression().setFamily("poisson").setLink(link) + .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") + val model = trainer.fit(dataset) + val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with poisson family, " + + s"$link link and fitIntercept = $fitIntercept (with zero values).") + idx += 1 + } + } + test("generalized linear regression: gamma family against glm") { /* R code: From 518dc1e1e63a8955b16a3f2ca7592264fd637ae6 Mon Sep 17 00:00:00 2001 From: WangTaoTheTonic Date: Mon, 14 Nov 2016 12:22:36 +0100 Subject: [PATCH 100/534] [SPARK-18396][HISTORYSERVER] Duration" column makes search result confused, maybe we should make it unsearchable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? When we search data in History Server, it will check if any columns contains the search string. Duration is represented as long value in table, so if we search simple string like "003", "111", the duration containing "003", ‘111“ will be showed, which make not much sense to users. We cannot simply transfer the long value to meaning format like "1 h", "3.2 min" because they are also used for sorting. Better way to handle it is ban "Duration" columns from searching. ## How was this patch tested manually tests. 
Before("local-1478225166651" pass the filter because its duration in long value, which is "257244245" contains search string "244"): ![before](https://cloud.githubusercontent.com/assets/5276001/20203166/f851ffc6-a7ff-11e6-8fe6-91a90ca92b23.jpg) After: ![after](https://cloud.githubusercontent.com/assets/5276001/20178646/2129fbb0-a78d-11e6-9edb-39f885ce3ed0.jpg) Author: WangTaoTheTonic Closes #15838 from WangTaoTheTonic/duration. (cherry picked from commit 637a0bb88f74712001f32a53ff66fd0b8cb67e4a) Signed-off-by: Sean Owen --- .../main/resources/org/apache/spark/ui/static/historypage.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index 6c0ec8d5fce54..8fd91865b0429 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -139,6 +139,9 @@ $(document).ready(function() { {name: 'eighth'}, {name: 'ninth'}, ], + "columnDefs": [ + {"searchable": false, "targets": [5]} + ], "autoWidth": false, "order": [[ 4, "desc" ]] }; From c07fe1c5924e167fb569427e5e6b78adcfde648e Mon Sep 17 00:00:00 2001 From: Noritaka Sekiyama Date: Mon, 14 Nov 2016 21:07:59 +0900 Subject: [PATCH 101/534] [SPARK-18432][DOC] Changed HDFS default block size from 64MB to 128MB Changed HDFS default block size from 64MB to 128MB. https://issues.apache.org/jira/browse/SPARK-18432 Author: Noritaka Sekiyama Closes #15879 from moomindani/SPARK-18432. (cherry picked from commit 9d07ceee7860921eafb55b47852f1b51089c98da) Signed-off-by: Kousuke Saruta --- docs/programming-guide.md | 6 +++--- docs/tuning.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/programming-guide.md b/docs/programming-guide.md index b9a2110b602a0..58bf17b4a84ef 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -343,7 +343,7 @@ Some notes on reading files with Spark: * All of Spark's file-based input methods, including `textFile`, support running on directories, compressed files, and wildcards as well. For example, you can use `textFile("/my/directory")`, `textFile("/my/directory/*.txt")`, and `textFile("/my/directory/*.gz")`. -* The `textFile` method also takes an optional second argument for controlling the number of partitions of the file. By default, Spark creates one partition for each block of the file (blocks being 64MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks. +* The `textFile` method also takes an optional second argument for controlling the number of partitions of the file. By default, Spark creates one partition for each block of the file (blocks being 128MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks. Apart from text files, Spark's Scala API also supports several other data formats: @@ -375,7 +375,7 @@ Some notes on reading files with Spark: * All of Spark's file-based input methods, including `textFile`, support running on directories, compressed files, and wildcards as well. For example, you can use `textFile("/my/directory")`, `textFile("/my/directory/*.txt")`, and `textFile("/my/directory/*.gz")`. -* The `textFile` method also takes an optional second argument for controlling the number of partitions of the file. 
By default, Spark creates one partition for each block of the file (blocks being 64MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks. +* The `textFile` method also takes an optional second argument for controlling the number of partitions of the file. By default, Spark creates one partition for each block of the file (blocks being 128MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks. Apart from text files, Spark's Java API also supports several other data formats: @@ -407,7 +407,7 @@ Some notes on reading files with Spark: * All of Spark's file-based input methods, including `textFile`, support running on directories, compressed files, and wildcards as well. For example, you can use `textFile("/my/directory")`, `textFile("/my/directory/*.txt")`, and `textFile("/my/directory/*.gz")`. -* The `textFile` method also takes an optional second argument for controlling the number of partitions of the file. By default, Spark creates one partition for each block of the file (blocks being 64MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks. +* The `textFile` method also takes an optional second argument for controlling the number of partitions of the file. By default, Spark creates one partition for each block of the file (blocks being 128MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks. Apart from text files, Spark's Python API also supports several other data formats: diff --git a/docs/tuning.md b/docs/tuning.md index 9c43b315bbb9e..0de303a3bd9bf 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -224,8 +224,8 @@ temporary objects created during task execution. Some steps which may be useful * As an example, if your task is reading data from HDFS, the amount of memory used by the task can be estimated using the size of the data block read from HDFS. Note that the size of a decompressed block is often 2 or 3 times the - size of the block. So if we wish to have 3 or 4 tasks' worth of working space, and the HDFS block size is 64 MB, - we can estimate size of Eden to be `4*3*64MB`. + size of the block. So if we wish to have 3 or 4 tasks' worth of working space, and the HDFS block size is 128 MB, + we can estimate size of Eden to be `4*3*128MB`. * Monitor how the frequency and time taken by garbage collection changes with the new settings. From 3c623d226a0c495c36c86d199879b9e922d1ece2 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 14 Nov 2016 10:03:01 -0800 Subject: [PATCH 102/534] [SPARK-18416][STRUCTURED STREAMING] Fixed temp file leak in state store ## What changes were proposed in this pull request? StateStore.get() causes temporary files to be created immediately, even if the store is not used to make updates for new version. The temp file is not closed as store.commit() is not called in those cases, thus keeping the output stream to temp file open forever. This PR fixes it by opening the temp file only when there are updates being made. ## How was this patch tested? New unit test Author: Tathagata Das Closes #15859 from tdas/SPARK-18416. 
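The core of the fix described above is simply deferring creation of the temp delta file until the store actually receives an update, by turning the output stream into a `lazy val`. Below is a minimal standalone sketch of that pattern; the `UpdateLog` class, method names, and file path are illustrative, not Spark's actual `HDFSBackedStateStoreProvider`:

```scala
import java.io.{File, FileOutputStream}
import scala.util.Random

// Illustrative stand-in for the provider, not Spark's actual class.
class UpdateLog(dir: File) {
  private val tempFile = new File(dir, s"temp-${Random.nextLong()}")

  // With a plain `val` the temp file (and an open stream) would be created as soon as
  // the object is constructed, even if nothing is ever written. As a `lazy val`, the
  // file is only created when the first write forces it.
  private lazy val out = new FileOutputStream(tempFile)

  def append(bytes: Array[Byte]): Unit = out.write(bytes)
  def commit(): Unit = out.close()
}

val readOnly = new UpdateLog(new File("/tmp"))
// No append/commit here: `out` is never forced, so no "temp-*" file appears on disk.
```

A store that is opened only for reads therefore never touches the filesystem, which is exactly the leak the patch closes.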
(cherry picked from commit bdfe60ac921172be0fb77de2f075cc7904a3b238) Signed-off-by: Shixiong Zhu --- .../state/HDFSBackedStateStoreProvider.scala | 10 +-- .../streaming/state/StateStoreSuite.scala | 63 +++++++++++++++++++ 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index 808713161c316..f07feaad5dc71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -87,8 +87,7 @@ private[state] class HDFSBackedStateStoreProvider( private val newVersion = version + 1 private val tempDeltaFile = new Path(baseDir, s"temp-${Random.nextLong}") - private val tempDeltaFileStream = compressStream(fs.create(tempDeltaFile, true)) - + private lazy val tempDeltaFileStream = compressStream(fs.create(tempDeltaFile, true)) private val allUpdates = new java.util.HashMap[UnsafeRow, StoreUpdate]() @volatile private var state: STATE = UPDATING @@ -101,7 +100,7 @@ private[state] class HDFSBackedStateStoreProvider( } override def put(key: UnsafeRow, value: UnsafeRow): Unit = { - verify(state == UPDATING, "Cannot remove after already committed or aborted") + verify(state == UPDATING, "Cannot put after already committed or aborted") val isNewKey = !mapToUpdate.containsKey(key) mapToUpdate.put(key, value) @@ -125,6 +124,7 @@ private[state] class HDFSBackedStateStoreProvider( /** Remove keys that match the following condition */ override def remove(condition: UnsafeRow => Boolean): Unit = { verify(state == UPDATING, "Cannot remove after already committed or aborted") + val keyIter = mapToUpdate.keySet().iterator() while (keyIter.hasNext) { val key = keyIter.next @@ -154,7 +154,7 @@ private[state] class HDFSBackedStateStoreProvider( finalizeDeltaFile(tempDeltaFileStream) finalDeltaFile = commitUpdates(newVersion, mapToUpdate, tempDeltaFile) state = COMMITTED - logInfo(s"Committed version $newVersion for $this") + logInfo(s"Committed version $newVersion for $this to file $finalDeltaFile") newVersion } catch { case NonFatal(e) => @@ -174,7 +174,7 @@ private[state] class HDFSBackedStateStoreProvider( if (tempDeltaFile != null) { fs.delete(tempDeltaFile, true) } - logInfo("Aborted") + logInfo(s"Aborted version $newVersion for $this") } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index 504a26516107f..533cd0cd2a2ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -468,6 +468,69 @@ class StateStoreSuite extends SparkFunSuite with BeforeAndAfter with PrivateMeth assert(e.getCause.getMessage.contains("Failed to rename")) } + test("SPARK-18416: do not create temp delta file until the store is updated") { + val dir = Utils.createDirectory(tempDir, Random.nextString(5)).toString + val storeId = StateStoreId(dir, 0, 0) + val storeConf = StateStoreConf.empty + val hadoopConf = new Configuration() + val deltaFileDir = new File(s"$dir/0/0/") + + def numTempFiles: Int = { + if (deltaFileDir.exists) { + 
deltaFileDir.listFiles.map(_.getName).count(n => n.contains("temp") && !n.startsWith(".")) + } else 0 + } + + def numDeltaFiles: Int = { + if (deltaFileDir.exists) { + deltaFileDir.listFiles.map(_.getName).count(n => n.contains(".delta") && !n.startsWith(".")) + } else 0 + } + + def shouldNotCreateTempFile[T](body: => T): T = { + val before = numTempFiles + val result = body + assert(numTempFiles === before) + result + } + + // Getting the store should not create temp file + val store0 = shouldNotCreateTempFile { + StateStore.get(storeId, keySchema, valueSchema, 0, storeConf, hadoopConf) + } + + // Put should create a temp file + put(store0, "a", 1) + assert(numTempFiles === 1) + assert(numDeltaFiles === 0) + + // Commit should remove temp file and create a delta file + store0.commit() + assert(numTempFiles === 0) + assert(numDeltaFiles === 1) + + // Remove should create a temp file + val store1 = shouldNotCreateTempFile { + StateStore.get(storeId, keySchema, valueSchema, 1, storeConf, hadoopConf) + } + remove(store1, _ == "a") + assert(numTempFiles === 1) + assert(numDeltaFiles === 1) + + // Commit should remove temp file and create a delta file + store1.commit() + assert(numTempFiles === 0) + assert(numDeltaFiles === 2) + + // Commit without any updates should create a delta file + val store2 = shouldNotCreateTempFile { + StateStore.get(storeId, keySchema, valueSchema, 2, storeConf, hadoopConf) + } + store2.commit() + assert(numTempFiles === 0) + assert(numDeltaFiles === 3) + } + def getDataFromFiles( provider: HDFSBackedStateStoreProvider, version: Int = -1): Set[(String, Int)] = { From db691f05cec9e03f507c5ed544bcc6edefb3842d Mon Sep 17 00:00:00 2001 From: cody koeninger Date: Mon, 14 Nov 2016 11:10:37 -0800 Subject: [PATCH 103/534] [SPARK-17510][STREAMING][KAFKA] config max rate on a per-partition basis ## What changes were proposed in this pull request? Allow configuration of max rate on a per-topicpartition basis. ## How was this patch tested? Unit tests. The reporter (Jeff Nadler) said he could test on his workload, so let's wait on that report. Author: cody koeninger Closes #15132 from koeninger/SPARK-17510. 
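For reference, a minimal sketch of how the new `PerPartitionConfig` overload of `createDirectStream` added by this patch might be used from user code. The topic name, rates, and broker address are made-up values, and `ssc`/`kafkaParams` are set up only to keep the snippet self-contained:

```scala
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._

object PerPartitionRateExample {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("ppc-example"), Seconds(5))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",   // assumption: a local broker
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "ppc-example")

    // Cap a known hot partition at a lower rate than the rest of the topic.
    val ppc = new PerPartitionConfig {
      override def maxRatePerPartition(tp: TopicPartition): Long =
        if (tp.topic == "events" && tp.partition == 0) 50L else 500L
    }

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("events"), kafkaParams),
      ppc)

    stream.map(r => (r.key, r.value)).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
```

When no `PerPartitionConfig` is passed, the existing three-argument overload builds a `DefaultPerPartitionConfig` from `spark.streaming.kafka.maxRatePerPartition`, so current jobs keep their behavior.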
(cherry picked from commit 89d1fa58dbe88560b1f2b0362fcc3035ccc888be) Signed-off-by: Shixiong Zhu --- .../kafka010/DirectKafkaInputDStream.scala | 11 ++-- .../spark/streaming/kafka010/KafkaUtils.scala | 53 ++++++++++++++++++- .../kafka010/PerPartitionConfig.scala | 47 ++++++++++++++++ .../kafka010/DirectKafkaStreamSuite.scala | 34 ++++++++---- .../kafka/DirectKafkaInputDStream.scala | 4 +- 5 files changed, 131 insertions(+), 18 deletions(-) create mode 100644 external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index 7e57bb18cbd50..794f53c5abfd0 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -57,7 +57,8 @@ import org.apache.spark.streaming.scheduler.rate.RateEstimator private[spark] class DirectKafkaInputDStream[K, V]( _ssc: StreamingContext, locationStrategy: LocationStrategy, - consumerStrategy: ConsumerStrategy[K, V] + consumerStrategy: ConsumerStrategy[K, V], + ppc: PerPartitionConfig ) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets { val executorKafkaParams = { @@ -128,12 +129,9 @@ private[spark] class DirectKafkaInputDStream[K, V]( } } - private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt( - "spark.streaming.kafka.maxRatePerPartition", 0) - protected[streaming] def maxMessagesPerPartition( offsets: Map[TopicPartition, Long]): Option[Map[TopicPartition, Long]] = { - val estimatedRateLimit = rateController.map(_.getLatestRate().toInt) + val estimatedRateLimit = rateController.map(_.getLatestRate()) // calculate a per-partition rate limit based on current lag val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match { @@ -144,11 +142,12 @@ private[spark] class DirectKafkaInputDStream[K, V]( val totalLag = lagPerPartition.values.sum lagPerPartition.map { case (tp, lag) => + val maxRateLimitPerPartition = ppc.maxRatePerPartition(tp) val backpressureRate = Math.round(lag / totalLag.toFloat * rate) tp -> (if (maxRateLimitPerPartition > 0) { Math.min(backpressureRate, maxRateLimitPerPartition)} else backpressureRate) } - case None => offsets.map { case (tp, offset) => tp -> maxRateLimitPerPartition } + case None => offsets.map { case (tp, offset) => tp -> ppc.maxRatePerPartition(tp) } } if (effectiveRateLimitPerPartition.values.sum > 0) { diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala index b2190bfa05a3a..c11917f59d5b8 100644 --- a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala @@ -123,7 +123,31 @@ object KafkaUtils extends Logging { locationStrategy: LocationStrategy, consumerStrategy: ConsumerStrategy[K, V] ): InputDStream[ConsumerRecord[K, V]] = { - new DirectKafkaInputDStream[K, V](ssc, locationStrategy, consumerStrategy) + val ppc = new DefaultPerPartitionConfig(ssc.sparkContext.getConf) + createDirectStream[K, V](ssc, locationStrategy, consumerStrategy, ppc) + } + + /** + * :: Experimental :: + * Scala 
constructor for a DStream where + * each given Kafka topic/partition corresponds to an RDD partition. + * @param locationStrategy In most cases, pass in LocationStrategies.preferConsistent, + * see [[LocationStrategies]] for more details. + * @param consumerStrategy In most cases, pass in ConsumerStrategies.subscribe, + * see [[ConsumerStrategies]] for more details. + * @param perPartitionConfig configuration of settings such as max rate on a per-partition basis. + * see [[PerPartitionConfig]] for more details. + * @tparam K type of Kafka message key + * @tparam V type of Kafka message value + */ + @Experimental + def createDirectStream[K, V]( + ssc: StreamingContext, + locationStrategy: LocationStrategy, + consumerStrategy: ConsumerStrategy[K, V], + perPartitionConfig: PerPartitionConfig + ): InputDStream[ConsumerRecord[K, V]] = { + new DirectKafkaInputDStream[K, V](ssc, locationStrategy, consumerStrategy, perPartitionConfig) } /** @@ -150,6 +174,33 @@ object KafkaUtils extends Logging { jssc.ssc, locationStrategy, consumerStrategy)) } + /** + * :: Experimental :: + * Java constructor for a DStream where + * each given Kafka topic/partition corresponds to an RDD partition. + * @param keyClass Class of the keys in the Kafka records + * @param valueClass Class of the values in the Kafka records + * @param locationStrategy In most cases, pass in LocationStrategies.preferConsistent, + * see [[LocationStrategies]] for more details. + * @param consumerStrategy In most cases, pass in ConsumerStrategies.subscribe, + * see [[ConsumerStrategies]] for more details + * @param perPartitionConfig configuration of settings such as max rate on a per-partition basis. + * see [[PerPartitionConfig]] for more details. + * @tparam K type of Kafka message key + * @tparam V type of Kafka message value + */ + @Experimental + def createDirectStream[K, V]( + jssc: JavaStreamingContext, + locationStrategy: LocationStrategy, + consumerStrategy: ConsumerStrategy[K, V], + perPartitionConfig: PerPartitionConfig + ): JavaInputDStream[ConsumerRecord[K, V]] = { + new JavaInputDStream( + createDirectStream[K, V]( + jssc.ssc, locationStrategy, consumerStrategy, perPartitionConfig)) + } + /** * Tweak kafka params to prevent issues on executors */ diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala new file mode 100644 index 0000000000000..4792f2a955110 --- /dev/null +++ b/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka010 + +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.SparkConf +import org.apache.spark.annotation.Experimental + +/** + * :: Experimental :: + * Interface for user-supplied configurations that can't otherwise be set via Spark properties, + * because they need tweaking on a per-partition basis, + */ +@Experimental +abstract class PerPartitionConfig extends Serializable { + /** + * Maximum rate (number of records per second) at which data will be read + * from each Kafka partition. + */ + def maxRatePerPartition(topicPartition: TopicPartition): Long +} + +/** + * Default per-partition configuration + */ +private class DefaultPerPartitionConfig(conf: SparkConf) + extends PerPartitionConfig { + val maxRate = conf.getLong("spark.streaming.kafka.maxRatePerPartition", 0) + + def maxRatePerPartition(topicPartition: TopicPartition): Long = maxRate +} diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala index 02aec43c3b34f..f36e0a901f7b0 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala @@ -252,7 +252,8 @@ class DirectKafkaStreamSuite val s = new DirectKafkaInputDStream[String, String]( ssc, preferredHosts, - ConsumerStrategies.Subscribe[String, String](List(topic), kafkaParams.asScala)) + ConsumerStrategies.Subscribe[String, String](List(topic), kafkaParams.asScala), + new DefaultPerPartitionConfig(sparkConf)) s.consumer.poll(0) assert( s.consumer.position(topicPartition) >= offsetBeforeStart, @@ -306,7 +307,8 @@ class DirectKafkaStreamSuite ConsumerStrategies.Assign[String, String]( List(topicPartition), kafkaParams.asScala, - Map(topicPartition -> 11L))) + Map(topicPartition -> 11L)), + new DefaultPerPartitionConfig(sparkConf)) s.consumer.poll(0) assert( s.consumer.position(topicPartition) >= offsetBeforeStart, @@ -518,7 +520,7 @@ class DirectKafkaStreamSuite test("maxMessagesPerPartition with backpressure disabled") { val topic = "maxMessagesPerPartition" - val kafkaStream = getDirectKafkaStream(topic, None) + val kafkaStream = getDirectKafkaStream(topic, None, None) val input = Map(new TopicPartition(topic, 0) -> 50L, new TopicPartition(topic, 1) -> 50L) assert(kafkaStream.maxMessagesPerPartition(input).get == @@ -528,7 +530,7 @@ class DirectKafkaStreamSuite test("maxMessagesPerPartition with no lag") { val topic = "maxMessagesPerPartition" val rateController = Some(new ConstantRateController(0, new ConstantEstimator(100), 100)) - val kafkaStream = getDirectKafkaStream(topic, rateController) + val kafkaStream = getDirectKafkaStream(topic, rateController, None) val input = Map(new TopicPartition(topic, 0) -> 0L, new TopicPartition(topic, 1) -> 0L) assert(kafkaStream.maxMessagesPerPartition(input).isEmpty) @@ -537,11 +539,19 @@ class DirectKafkaStreamSuite test("maxMessagesPerPartition respects max rate") { val topic = "maxMessagesPerPartition" val rateController = Some(new ConstantRateController(0, new ConstantEstimator(100), 1000)) - val kafkaStream = getDirectKafkaStream(topic, rateController) + val ppc = Some(new PerPartitionConfig { + def maxRatePerPartition(tp: TopicPartition) = + if (tp.topic == topic && tp.partition == 0) { + 50 + } else { + 100 + } + }) + val kafkaStream = 
getDirectKafkaStream(topic, rateController, ppc) val input = Map(new TopicPartition(topic, 0) -> 1000L, new TopicPartition(topic, 1) -> 1000L) assert(kafkaStream.maxMessagesPerPartition(input).get == - Map(new TopicPartition(topic, 0) -> 10L, new TopicPartition(topic, 1) -> 10L)) + Map(new TopicPartition(topic, 0) -> 5L, new TopicPartition(topic, 1) -> 10L)) } test("using rate controller") { @@ -570,7 +580,9 @@ class DirectKafkaStreamSuite new DirectKafkaInputDStream[String, String]( ssc, preferredHosts, - ConsumerStrategies.Subscribe[String, String](List(topic), kafkaParams.asScala)) { + ConsumerStrategies.Subscribe[String, String](List(topic), kafkaParams.asScala), + new DefaultPerPartitionConfig(sparkConf) + ) { override protected[streaming] val rateController = Some(new DirectKafkaRateController(id, estimator)) }.map(r => (r.key, r.value)) @@ -616,7 +628,10 @@ class DirectKafkaStreamSuite }.toSeq.sortBy { _._1 } } - private def getDirectKafkaStream(topic: String, mockRateController: Option[RateController]) = { + private def getDirectKafkaStream( + topic: String, + mockRateController: Option[RateController], + ppc: Option[PerPartitionConfig]) = { val batchIntervalMilliseconds = 100 val sparkConf = new SparkConf() @@ -643,7 +658,8 @@ class DirectKafkaStreamSuite tps.foreach(tp => consumer.seek(tp, 0)) consumer } - } + }, + ppc.getOrElse(new DefaultPerPartitionConfig(sparkConf)) ) { override protected[streaming] val rateController = mockRateController } diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index c3c799375bbeb..d52c230eb7849 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -88,12 +88,12 @@ class DirectKafkaInputDStream[ protected val kc = new KafkaCluster(kafkaParams) - private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt( + private val maxRateLimitPerPartition: Long = context.sparkContext.getConf.getLong( "spark.streaming.kafka.maxRatePerPartition", 0) protected[streaming] def maxMessagesPerPartition( offsets: Map[TopicAndPartition, Long]): Option[Map[TopicAndPartition, Long]] = { - val estimatedRateLimit = rateController.map(_.getLatestRate().toInt) + val estimatedRateLimit = rateController.map(_.getLatestRate()) // calculate a per-partition rate limit based on current lag val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match { From cff7a70b59c3ac2cb1fab2216e9e6dcf2a6ac89a Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 14 Nov 2016 19:42:00 +0000 Subject: [PATCH 104/534] [SPARK-11496][GRAPHX][FOLLOWUP] Add param checking for runParallelPersonalizedPageRank ## What changes were proposed in this pull request? add the param checking to keep in line with other algos ## How was this patch tested? existing tests Author: Zheng RuiFeng Closes #15876 from zhengruifeng/param_check_runParallelPersonalizedPageRank. 
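A small illustrative snippet of the effect of the new checks, assuming an active `SparkContext` named `sc` (the toy graph is made up): invalid arguments now fail fast with an `IllegalArgumentException` instead of surfacing as a confusing error deep inside the algorithm.

```scala
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.graphx.lib.PageRank

// A tiny two-edge graph, just enough to call the API.
val graph = Graph.fromEdges(
  sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1))), defaultValue = 1)

// An empty `sources` array (likewise numIter <= 0 or resetProb outside [0, 1])
// is now rejected up front by the added require() checks.
try {
  PageRank.runParallelPersonalizedPageRank(
    graph, numIter = 10, resetProb = 0.15, sources = Array.empty[Long])
} catch {
  case e: IllegalArgumentException => println(s"rejected: ${e.getMessage}")
}
```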
(cherry picked from commit 75934457d75996be71ffd0d4b448497d656c0d40) Signed-off-by: DB Tsai --- .../main/scala/org/apache/spark/graphx/lib/PageRank.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index f4b00757a8b54..c0c3c73463aab 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -185,6 +185,13 @@ object PageRank extends Logging { def runParallelPersonalizedPageRank[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15, sources: Array[VertexId]): Graph[Vector, Double] = { + require(numIter > 0, s"Number of iterations must be greater than 0," + + s" but got ${numIter}") + require(resetProb >= 0 && resetProb <= 1, s"Random reset probability must belong" + + s" to [0, 1], but got ${resetProb}") + require(sources.nonEmpty, s"The list of sources must be non-empty," + + s" but got ${sources.mkString("[", ",", "]")}") + // TODO if one sources vertex id is outside of the int range // we won't be able to store its activations in a sparse vector val zero = Vectors.sparse(sources.size, List()).asBreeze From ae66799feec895751f49418885da58f35fc2aaa6 Mon Sep 17 00:00:00 2001 From: Nattavut Sutyanyong Date: Mon, 14 Nov 2016 20:59:15 +0100 Subject: [PATCH 105/534] [SPARK-17348][SQL] Incorrect results from subquery transformation ## What changes were proposed in this pull request? Return an Analysis exception when there is a correlated non-equality predicate in a subquery and the correlated column from the outer reference is not from the immediate parent operator of the subquery. This PR prevents incorrect results from subquery transformation in such case. Test cases, both positive and negative tests, are added. ## How was this patch tested? sql/test, catalyst/test, hive/test, and scenarios that will produce incorrect results without this PR and product correct results when subquery transformation does happen. Author: Nattavut Sutyanyong Closes #15763 from nsyca/spark-17348. (cherry picked from commit bd85603ba5f9e61e1aa8326d3e4d5703b5977a4c) Signed-off-by: Herman van Hovell --- .../sql/catalyst/analysis/Analyzer.scala | 44 +++++++++ .../sql/catalyst/analysis/CheckAnalysis.scala | 7 -- .../org/apache/spark/sql/SubquerySuite.scala | 95 ++++++++++++++++++- 3 files changed, 137 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 8dbec408002f1..dcee2e4b1fe73 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -972,6 +972,37 @@ class Analyzer( } } + // SPARK-17348: A potential incorrect result case. + // When a correlated predicate is a non-equality predicate, + // certain operators are not permitted from the operator + // hosting the correlated predicate up to the operator on the outer table. + // Otherwise, the pull up of the correlated predicate + // will generate a plan with a different semantics + // which could return incorrect result. + // Currently we check for Aggregate and Window operators + // + // Below shows an example of a Logical Plan during Analyzer phase that + // show this problem. 
Pulling the correlated predicate [outer(c2#77) >= ..] + // through the Aggregate (or Window) operator could alter the result of + // the Aggregate. + // + // Project [c1#76] + // +- Project [c1#87, c2#88] + // : (Aggregate or Window operator) + // : +- Filter [outer(c2#77) >= c2#88)] + // : +- SubqueryAlias t2, `t2` + // : +- Project [_1#84 AS c1#87, _2#85 AS c2#88] + // : +- LocalRelation [_1#84, _2#85] + // +- SubqueryAlias t1, `t1` + // +- Project [_1#73 AS c1#76, _2#74 AS c2#77] + // +- LocalRelation [_1#73, _2#74] + def failOnNonEqualCorrelatedPredicate(found: Boolean, p: LogicalPlan): Unit = { + if (found) { + // Report a non-supported case as an exception + failAnalysis(s"Correlated column is not allowed in a non-equality predicate:\n$p") + } + } + /** Determine which correlated predicate references are missing from this plan. */ def missingReferences(p: LogicalPlan): AttributeSet = { val localPredicateReferences = p.collect(predicateMap) @@ -982,12 +1013,20 @@ class Analyzer( localPredicateReferences -- p.outputSet } + var foundNonEqualCorrelatedPred : Boolean = false + // Simplify the predicates before pulling them out. val transformed = BooleanSimplification(sub) transformUp { case f @ Filter(cond, child) => // Find all predicates with an outer reference. val (correlated, local) = splitConjunctivePredicates(cond).partition(containsOuter) + // Find any non-equality correlated predicates + foundNonEqualCorrelatedPred = foundNonEqualCorrelatedPred || correlated.exists { + case _: EqualTo | _: EqualNullSafe => false + case _ => true + } + // Rewrite the filter without the correlated predicates if any. correlated match { case Nil => f @@ -1009,12 +1048,17 @@ class Analyzer( } case a @ Aggregate(grouping, expressions, child) => failOnOuterReference(a) + failOnNonEqualCorrelatedPredicate(foundNonEqualCorrelatedPred, a) + val referencesToAdd = missingReferences(a) if (referencesToAdd.nonEmpty) { Aggregate(grouping ++ referencesToAdd, expressions ++ referencesToAdd, child) } else { a } + case w : Window => + failOnNonEqualCorrelatedPredicate(foundNonEqualCorrelatedPred, w) + w case j @ Join(left, _, RightOuter, _) => failOnOuterReference(j) failOnOuterReferenceInSubTree(left, "a RIGHT OUTER JOIN") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 3455a567b7786..7b75c1f70974b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -119,13 +119,6 @@ trait CheckAnalysis extends PredicateHelper { } case s @ ScalarSubquery(query, conditions, _) if conditions.nonEmpty => - // Make sure we are using equi-joins. - conditions.foreach { - case _: EqualTo | _: EqualNullSafe => // ok - case e => failAnalysis( - s"The correlated scalar subquery can only contain equality predicates: $e") - } - // Make sure correlated scalar subqueries contain one row for every outer row by // enforcing that they are aggregates which contain exactly one aggregate expressions. 
// The analyzer has already checked that subquery contained only one output column, and diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 89348668340be..c84a6f161893c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -498,10 +498,10 @@ class SubquerySuite extends QueryTest with SharedSQLContext { test("non-equal correlated scalar subquery") { val msg1 = intercept[AnalysisException] { - sql("select a, (select b from l l2 where l2.a < l1.a) sum_b from l l1") + sql("select a, (select sum(b) from l l2 where l2.a < l1.a) sum_b from l l1") } assert(msg1.getMessage.contains( - "The correlated scalar subquery can only contain equality predicates")) + "Correlated column is not allowed in a non-equality predicate:")) } test("disjunctive correlated scalar subquery") { @@ -639,6 +639,97 @@ class SubquerySuite extends QueryTest with SharedSQLContext { | from t1 left join t2 on t1.c1=t2.c2) t3 | where c3 not in (select c2 from t2)""".stripMargin), Row(2) :: Nil) + } + } + + test("SPARK-17348: Correlated subqueries with non-equality predicate (good case)") { + withTempView("t1", "t2") { + Seq((1, 1)).toDF("c1", "c2").createOrReplaceTempView("t1") + Seq((1, 1), (2, 0)).toDF("c1", "c2").createOrReplaceTempView("t2") + + // Simple case + checkAnswer( + sql( + """ + | select c1 + | from t1 + | where c1 in (select t2.c1 + | from t2 + | where t1.c2 >= t2.c2)""".stripMargin), + Row(1) :: Nil) + + // More complex case with OR predicate + checkAnswer( + sql( + """ + | select t1.c1 + | from t1, t1 as t3 + | where t1.c1 = t3.c1 + | and (t1.c1 in (select t2.c1 + | from t2 + | where t1.c2 >= t2.c2 + | or t3.c2 < t2.c2) + | or t1.c2 >= 0)""".stripMargin), + Row(1) :: Nil) + } + } + + test("SPARK-17348: Correlated subqueries with non-equality predicate (error case)") { + withTempView("t1", "t2", "t3", "t4") { + Seq((1, 1)).toDF("c1", "c2").createOrReplaceTempView("t1") + Seq((1, 1), (2, 0)).toDF("c1", "c2").createOrReplaceTempView("t2") + Seq((2, 1)).toDF("c1", "c2").createOrReplaceTempView("t3") + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t4") + + // Simplest case + intercept[AnalysisException] { + sql( + """ + | select t1.c1 + | from t1 + | where t1.c1 in (select max(t2.c1) + | from t2 + | where t1.c2 >= t2.c2)""".stripMargin).collect() + } + + // Add a HAVING on top and augmented within an OR predicate + intercept[AnalysisException] { + sql( + """ + | select t1.c1 + | from t1 + | where t1.c1 in (select max(t2.c1) + | from t2 + | where t1.c2 >= t2.c2 + | having count(*) > 0 ) + | or t1.c2 >= 0""".stripMargin).collect() + } + + // Add a HAVING on top and augmented within an OR predicate + intercept[AnalysisException] { + sql( + """ + | select t1.c1 + | from t1, t1 as t3 + | where t1.c1 = t3.c1 + | and (t1.c1 in (select max(t2.c1) + | from t2 + | where t1.c2 = t2.c2 + | or t3.c2 = t2.c2) + | )""".stripMargin).collect() + } + + // In Window expression: changing the data set to + // demonstrate if this query ran, it would return incorrect result. 
+ intercept[AnalysisException] { + sql( + """ + | select c1 + | from t3 + | where c1 in (select max(t4.c1) over () + | from t4 + | where t3.c2 >= t4.c2)""".stripMargin).collect() + } } } } From 27999b3661481c0232135dbe021787afe963d812 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Mon, 14 Nov 2016 16:46:26 -0800 Subject: [PATCH 106/534] [SPARK-18124] Observed delay based Event Time Watermarks This PR adds a new method `withWatermark` to the `Dataset` API, which can be used specify an _event time watermark_. An event time watermark allows the streaming engine to reason about the point in time after which we no longer expect to see late data. This PR also has augmented `StreamExecution` to use this watermark for several purposes: - To know when a given time window aggregation is finalized and thus results can be emitted when using output modes that do not allow updates (e.g. `Append` mode). - To minimize the amount of state that we need to keep for on-going aggregations, by evicting state for groups that are no longer expected to change. Although, we do still maintain all state if the query requires (i.e. if the event time is not present in the `groupBy` or when running in `Complete` mode). An example that emits windowed counts of records, waiting up to 5 minutes for late data to arrive. ```scala df.withWatermark("eventTime", "5 minutes") .groupBy(window($"eventTime", "1 minute") as 'window) .count() .writeStream .format("console") .mode("append") // In append mode, we only output finalized aggregations. .start() ``` ### Calculating the watermark. The current event time is computed by looking at the `MAX(eventTime)` seen this epoch across all of the partitions in the query minus some user defined _delayThreshold_. An additional constraint is that the watermark must increase monotonically. Note that since we must coordinate this value across partitions occasionally, the actual watermark used is only guaranteed to be at least `delay` behind the actual event time. In some cases we may still process records that arrive more than delay late. This mechanism was chosen for the initial implementation over processing time for two reasons: - it is robust to downtime that could affect processing delay - it does not require syncing of time or timezones between the producer and the processing engine. ### Other notable implementation details - A new trigger metric `eventTimeWatermark` outputs the current value of the watermark. - We mark the event time column in the `Attribute` metadata using the key `spark.watermarkDelay`. This allows downstream operations to know which column holds the event time. Operations like `window` propagate this metadata. - `explain()` marks the watermark with a suffix of `-T${delayMs}` to ease debugging of how this information is propagated. - Currently, we don't filter out late records, but instead rely on the state store to avoid emitting records that are both added and filtered in the same epoch. ### Remaining in this PR - [ ] The test for recovery is currently failing as we don't record the watermark used in the offset log. We will need to do so to ensure determinism, but this is deferred until #15626 is merged. ### Other follow-ups There are some natural additional features that we should consider for future work: - Ability to write records that arrive too late to some external store in case any out-of-band remediation is required. - `Update` mode so you can get partial results before a group is evicted. - Other mechanisms for calculating the watermark. 
In particular a watermark based on quantiles would be more robust to outliers. Author: Michael Armbrust Closes #15702 from marmbrus/watermarks. (cherry picked from commit c07187823a98f0d1a0f58c06e28a27e1abed157a) Signed-off-by: Tathagata Das --- .../spark/unsafe/types/CalendarInterval.java | 4 + .../apache/spark/sql/AnalysisException.scala | 3 +- .../sql/catalyst/analysis/Analyzer.scala | 8 +- .../sql/catalyst/analysis/CheckAnalysis.scala | 10 + .../UnsupportedOperationChecker.scala | 18 +- .../sql/catalyst/analysis/unresolved.scala | 3 +- .../expressions/namedExpressions.scala | 17 +- .../plans/logical/EventTimeWatermark.scala | 51 +++++ .../scala/org/apache/spark/sql/Dataset.scala | 40 +++- .../spark/sql/execution/SparkStrategies.scala | 12 +- .../sql/execution/aggregate/AggUtils.scala | 9 +- .../sql/execution/command/commands.scala | 2 +- .../streaming/EventTimeWatermarkExec.scala | 93 +++++++++ .../sql/execution/streaming/ForeachSink.scala | 3 +- .../streaming/IncrementalExecution.scala | 12 +- .../streaming/StatefulAggregate.scala | 170 +++++++++------- .../execution/streaming/StreamExecution.scala | 25 ++- .../execution/streaming/StreamMetrics.scala | 1 + .../state/HDFSBackedStateStoreProvider.scala | 23 ++- .../streaming/state/StateStore.scala | 7 +- .../streaming/state/StateStoreSuite.scala | 6 +- .../spark/sql/streaming/WatermarkSuite.scala | 191 ++++++++++++++++++ 22 files changed, 597 insertions(+), 111 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/WatermarkSuite.scala diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java index 518ed6470a753..a7b0e6f80c2b6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java @@ -252,6 +252,10 @@ public static long parseSecondNano(String secondNano) throws IllegalArgumentExce public final int months; public final long microseconds; + public final long milliseconds() { + return this.microseconds / MICROS_PER_MILLI; + } + public CalendarInterval(int months, long microseconds) { this.months = months; this.microseconds = microseconds; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala index 7defb9df862c0..ff8576157305b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/AnalysisException.scala @@ -31,7 +31,8 @@ class AnalysisException protected[sql] ( val message: String, val line: Option[Int] = None, val startPosition: Option[Int] = None, - val plan: Option[LogicalPlan] = None, + // Some plans fail to serialize due to bugs in scala collections. 
+ @transient val plan: Option[LogicalPlan] = None, val cause: Option[Throwable] = None) extends Exception(message, cause.orNull) with Serializable { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index dcee2e4b1fe73..b7e167557c559 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2213,7 +2213,13 @@ object TimeWindowing extends Rule[LogicalPlan] { windowExpressions.head.timeColumn.resolved && windowExpressions.head.checkInputDataTypes().isSuccess) { val window = windowExpressions.head - val windowAttr = AttributeReference("window", window.dataType)() + + val metadata = window.timeColumn match { + case a: Attribute => a.metadata + case _ => Metadata.empty + } + val windowAttr = + AttributeReference("window", window.dataType, metadata = metadata)() val maxNumOverlapping = math.ceil(window.windowDuration * 1.0 / window.slideDuration).toInt val windows = Seq.tabulate(maxNumOverlapping + 1) { i => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 7b75c1f70974b..98e50d0d3c674 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -148,6 +148,16 @@ trait CheckAnalysis extends PredicateHelper { } operator match { + case etw: EventTimeWatermark => + etw.eventTime.dataType match { + case s: StructType + if s.find(_.name == "end").map(_.dataType) == Some(TimestampType) => + case _: TimestampType => + case _ => + failAnalysis( + s"Event time must be defined on a window or a timestamp, but " + + s"${etw.eventTime.name} is of type ${etw.eventTime.dataType.simpleString}") + } case f: Filter if f.condition.dataType != BooleanType => failAnalysis( s"filter expression '${f.condition.sql}' " + diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index e81370c504abb..c054fcbef36f3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.{AnalysisException, InternalOutputModes} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.streaming.OutputMode @@ -55,9 +56,20 @@ object UnsupportedOperationChecker { // Disallow some output mode outputMode match { case InternalOutputModes.Append if aggregates.nonEmpty => - throwError( - s"$outputMode output mode not supported when there are streaming aggregations on " + - s"streaming DataFrames/DataSets")(plan) + val aggregate = aggregates.head + + // Find any attributes that are associated with an eventTime watermark. 
+ val watermarkAttributes = aggregate.groupingExpressions.collect { + case a: Attribute if a.metadata.contains(EventTimeWatermark.delayKey) => a + } + + // We can append rows to the sink once the group is under the watermark. Without this + // watermark a group is never "finished" so we would never output anything. + if (watermarkAttributes.isEmpty) { + throwError( + s"$outputMode output mode not supported when there are streaming aggregations on " + + s"streaming DataFrames/DataSets")(plan) + } case InternalOutputModes.Complete | InternalOutputModes.Update if aggregates.isEmpty => throwError( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 235ae04782455..36ed9ba50372b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, Codege import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.quoteIdentifier -import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.types.{DataType, Metadata, StructType} /** * Thrown when an invalid attempt is made to access a property of a tree that has yet to be fully @@ -98,6 +98,7 @@ case class UnresolvedAttribute(nameParts: Seq[String]) extends Attribute with Un override def withNullability(newNullability: Boolean): UnresolvedAttribute = this override def withQualifier(newQualifier: Option[String]): UnresolvedAttribute = this override def withName(newName: String): UnresolvedAttribute = UnresolvedAttribute.quoted(newName) + override def withMetadata(newMetadata: Metadata): Attribute = this override def toString: String = s"'$name" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 306a99d5a37bf..1274757136051 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -22,6 +22,7 @@ import java.util.{Objects, UUID} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.types._ @@ -104,6 +105,7 @@ abstract class Attribute extends LeafExpression with NamedExpression with NullIn def withNullability(newNullability: Boolean): Attribute def withQualifier(newQualifier: Option[String]): Attribute def withName(newName: String): Attribute + def withMetadata(newMetadata: Metadata): Attribute override def toAttribute: Attribute = this def newInstance(): Attribute @@ -292,11 +294,22 @@ case class AttributeReference( } } + override def withMetadata(newMetadata: Metadata): Attribute = { + AttributeReference(name, dataType, nullable, newMetadata)(exprId, qualifier, isGenerated) + } + override protected final def otherCopyArgs: Seq[AnyRef] = { exprId :: qualifier :: isGenerated :: Nil } - override def toString: 
String = s"$name#${exprId.id}$typeSuffix" + /** Used to signal the column used to calculate an eventTime watermark (e.g. a#1-T{delayMs}) */ + private def delaySuffix = if (metadata.contains(EventTimeWatermark.delayKey)) { + s"-T${metadata.getLong(EventTimeWatermark.delayKey)}ms" + } else { + "" + } + + override def toString: String = s"$name#${exprId.id}$typeSuffix$delaySuffix" // Since the expression id is not in the first constructor it is missing from the default // tree string. @@ -332,6 +345,8 @@ case class PrettyAttribute( override def withQualifier(newQualifier: Option[String]): Attribute = throw new UnsupportedOperationException override def withName(newName: String): Attribute = throw new UnsupportedOperationException + override def withMetadata(newMetadata: Metadata): Attribute = + throw new UnsupportedOperationException override def qualifier: Option[String] = throw new UnsupportedOperationException override def exprId: ExprId = throw new UnsupportedOperationException override def nullable: Boolean = true diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala new file mode 100644 index 0000000000000..4224a7997c410 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.plans.logical + +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.unsafe.types.CalendarInterval + +object EventTimeWatermark { + /** The [[org.apache.spark.sql.types.Metadata]] key used to hold the eventTime watermark delay. */ + val delayKey = "spark.watermarkDelayMs" +} + +/** + * Used to mark a user specified column as holding the event time for a row. + */ +case class EventTimeWatermark( + eventTime: Attribute, + delay: CalendarInterval, + child: LogicalPlan) extends LogicalPlan { + + // Update the metadata on the eventTime column to include the desired delay. 
+ override val output: Seq[Attribute] = child.output.map { a => + if (a semanticEquals eventTime) { + val updatedMetadata = new MetadataBuilder() + .withMetadata(a.metadata) + .putLong(EventTimeWatermark.delayKey, delay.milliseconds) + .build() + a.withMetadata(updatedMetadata) + } else { + a + } + } + + override val children: Seq[LogicalPlan] = child :: Nil +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index eb2b20afc37cf..af30683cc01c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -50,6 +50,7 @@ import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.streaming.DataStreamWriter import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel +import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.Utils private[sql] object Dataset { @@ -476,7 +477,7 @@ class Dataset[T] private[sql]( * `collect()`, will throw an [[AnalysisException]] when there is a streaming * source present. * - * @group basic + * @group streaming * @since 2.0.0 */ @Experimental @@ -496,8 +497,6 @@ class Dataset[T] private[sql]( /** * Returns a checkpointed version of this Dataset. * - * @param eager When true, materializes the underlying checkpointed RDD eagerly. - * * @group basic * @since 2.1.0 */ @@ -535,6 +534,41 @@ class Dataset[T] private[sql]( )(sparkSession)).as[T] } + /** + * :: Experimental :: + * Defines an event time watermark for this [[Dataset]]. A watermark tracks a point in time + * before which we assume no more late data is going to arrive. + * + * Spark will use this watermark for several purposes: + * - To know when a given time window aggregation can be finalized and thus can be emitted when + * using output modes that do not allow updates. + * - To minimize the amount of state that we need to keep for on-going aggregations. + * + * The current watermark is computed by looking at the `MAX(eventTime)` seen across + * all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost + * of coordinating this value across partitions, the actual watermark used is only guaranteed + * to be at least `delayThreshold` behind the actual event time. In some cases we may still + * process records that arrive more than `delayThreshold` late. + * + * @param eventTime the name of the column that contains the event time of the row. + * @param delayThreshold the minimum delay to wait to data to arrive late, relative to the latest + * record that has been processed in the form of an interval + * (e.g. "1 minute" or "5 hours"). + * + * @group streaming + * @since 2.1.0 + */ + @Experimental + @InterfaceStability.Evolving + // We only accept an existing column name, not a derived column here as a watermark that is + // defined on a derived column cannot referenced elsewhere in the plan. + def withWatermark(eventTime: String, delayThreshold: String): Dataset[T] = withTypedPlan { + val parsedDelay = + Option(CalendarInterval.fromString("interval " + delayThreshold)) + .getOrElse(throw new AnalysisException(s"Unable to parse time delay '$delayThreshold'")) + EventTimeWatermark(UnresolvedAttribute(eventTime), parsedDelay, logicalPlan) + } + /** * Displays the Dataset in a tabular form. Strings more than 20 characters will be truncated, * and all cells will be aligned right. 
For example: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 190fdd84343ee..2308ae8a6c611 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -18,20 +18,23 @@ package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{execution, SaveMode, Strategy} +import org.apache.spark.sql.{SaveMode, Strategy} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning._ import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, EventTimeWatermark, LogicalPlan} import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.exchange.ShuffleExchange import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight} -import org.apache.spark.sql.execution.streaming.{MemoryPlan, StreamingExecutionRelation, StreamingRelation, StreamingRelationExec} +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.StreamingQuery /** * Converts a logical plan into zero or more SparkPlans. This API is exposed for experimenting @@ -224,6 +227,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { */ object StatefulAggregationStrategy extends Strategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case EventTimeWatermark(columnName, delay, child) => + EventTimeWatermarkExec(columnName, delay, planLater(child)) :: Nil + case PhysicalAggregation( namedGroupingExpressions, aggregateExpressions, rewrittenResultExpressions, child) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala index 4fbb9d554c9bf..f7ea8970edf90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala @@ -313,8 +313,13 @@ object AggUtils { } // Note: stateId and returnAllStates are filled in later with preparation rules // in IncrementalExecution. 
- val saved = StateStoreSaveExec( - groupingAttributes, stateId = None, returnAllStates = None, partialMerged2) + val saved = + StateStoreSaveExec( + groupingAttributes, + stateId = None, + outputMode = None, + eventTimeWatermark = None, + partialMerged2) val finalAndCompleteAggregate: SparkPlan = { val finalAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Final)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index d82e54e57564c..52d8dc22a2d4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -104,7 +104,7 @@ case class ExplainCommand( if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. - new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "", 0) + new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala new file mode 100644 index 0000000000000..4c8cb069d23a0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import scala.math.max + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} +import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.types.MetadataBuilder +import org.apache.spark.unsafe.types.CalendarInterval +import org.apache.spark.util.AccumulatorV2 + +/** Tracks the maximum positive long seen. 
*/ +class MaxLong(protected var currentValue: Long = 0) + extends AccumulatorV2[Long, Long] { + + override def isZero: Boolean = value == 0 + override def value: Long = currentValue + override def copy(): AccumulatorV2[Long, Long] = new MaxLong(currentValue) + + override def reset(): Unit = { + currentValue = 0 + } + + override def add(v: Long): Unit = { + currentValue = max(v, value) + } + + override def merge(other: AccumulatorV2[Long, Long]): Unit = { + currentValue = max(value, other.value) + } +} + +/** + * Used to mark a column as the containing the event time for a given record. In addition to + * adding appropriate metadata to this column, this operator also tracks the maximum observed event + * time. Based on the maximum observed time and a user specified delay, we can calculate the + * `watermark` after which we assume we will no longer see late records for a particular time + * period. + */ +case class EventTimeWatermarkExec( + eventTime: Attribute, + delay: CalendarInterval, + child: SparkPlan) extends SparkPlan { + + // TODO: Use Spark SQL Metrics? + val maxEventTime = new MaxLong + sparkContext.register(maxEventTime) + + override protected def doExecute(): RDD[InternalRow] = { + child.execute().mapPartitions { iter => + val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) + iter.map { row => + maxEventTime.add(getEventTime(row).getLong(0)) + row + } + } + } + + // Update the metadata on the eventTime column to include the desired delay. + override val output: Seq[Attribute] = child.output.map { a => + if (a semanticEquals eventTime) { + val updatedMetadata = new MetadataBuilder() + .withMetadata(a.metadata) + .putLong(EventTimeWatermark.delayKey, delay.milliseconds) + .build() + + a.withMetadata(updatedMetadata) + } else { + a + } + } + + override def children: Seq[SparkPlan] = child :: Nil +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala index 24f98b9211f12..f5c550dd6ac3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala @@ -60,7 +60,8 @@ class ForeachSink[T : Encoder](writer: ForeachWriter[T]) extends Sink with Seria deserialized, data.queryExecution.asInstanceOf[IncrementalExecution].outputMode, data.queryExecution.asInstanceOf[IncrementalExecution].checkpointLocation, - data.queryExecution.asInstanceOf[IncrementalExecution].currentBatchId) + data.queryExecution.asInstanceOf[IncrementalExecution].currentBatchId, + data.queryExecution.asInstanceOf[IncrementalExecution].currentEventTimeWatermark) incrementalExecution.toRdd.mapPartitions { rows => rows.map(_.get(0, objectType)) }.asInstanceOf[RDD[T]] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index 05294df2673dc..e9d072f8a98b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -32,11 +32,13 @@ class IncrementalExecution( logicalPlan: LogicalPlan, val outputMode: OutputMode, val checkpointLocation: String, - val currentBatchId: Long) + val currentBatchId: Long, + val currentEventTimeWatermark: Long) extends QueryExecution(sparkSession, 
logicalPlan) { // TODO: make this always part of planning. - val stateStrategy = sparkSession.sessionState.planner.StatefulAggregationStrategy +: + val stateStrategy = + sparkSession.sessionState.planner.StatefulAggregationStrategy +: sparkSession.sessionState.planner.StreamingRelationStrategy +: sparkSession.sessionState.experimentalMethods.extraStrategies @@ -57,17 +59,17 @@ class IncrementalExecution( val state = new Rule[SparkPlan] { override def apply(plan: SparkPlan): SparkPlan = plan transform { - case StateStoreSaveExec(keys, None, None, + case StateStoreSaveExec(keys, None, None, None, UnaryExecNode(agg, StateStoreRestoreExec(keys2, None, child))) => val stateId = OperatorStateId(checkpointLocation, operatorId, currentBatchId) - val returnAllStates = if (outputMode == InternalOutputModes.Complete) true else false operatorId += 1 StateStoreSaveExec( keys, Some(stateId), - Some(returnAllStates), + Some(outputMode), + Some(currentEventTimeWatermark), agg.withNewChildren( StateStoreRestoreExec( keys, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala index ad8238f189c64..7af978a9c4aa2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StatefulAggregate.scala @@ -21,12 +21,17 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratePredicate, GenerateUnsafeProjection} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution +import org.apache.spark.sql.InternalOutputModes._ +import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.StructType + /** Used to identify the state store for a given operator. 
*/ case class OperatorStateId( @@ -92,8 +97,9 @@ case class StateStoreRestoreExec( */ case class StateStoreSaveExec( keyExpressions: Seq[Attribute], - stateId: Option[OperatorStateId], - returnAllStates: Option[Boolean], + stateId: Option[OperatorStateId] = None, + outputMode: Option[OutputMode] = None, + eventTimeWatermark: Option[Long] = None, child: SparkPlan) extends execution.UnaryExecNode with StatefulOperator { @@ -104,9 +110,9 @@ case class StateStoreSaveExec( override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver - assert(returnAllStates.nonEmpty, - "Incorrect planning in IncrementalExecution, returnAllStates have not been set") - val saveAndReturnFunc = if (returnAllStates.get) saveAndReturnAll _ else saveAndReturnUpdated _ + assert(outputMode.nonEmpty, + "Incorrect planning in IncrementalExecution, outputMode has not been set") + child.execute().mapPartitionsWithStateStore( getStateId.checkpointLocation, operatorId = getStateId.operatorId, @@ -114,75 +120,95 @@ case class StateStoreSaveExec( keyExpressions.toStructType, child.output.toStructType, sqlContext.sessionState, - Some(sqlContext.streams.stateStoreCoordinator) - )(saveAndReturnFunc) + Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => + val getKey = GenerateUnsafeProjection.generate(keyExpressions, child.output) + val numOutputRows = longMetric("numOutputRows") + val numTotalStateRows = longMetric("numTotalStateRows") + val numUpdatedStateRows = longMetric("numUpdatedStateRows") + + outputMode match { + // Update and output all rows in the StateStore. + case Some(Complete) => + while (iter.hasNext) { + val row = iter.next().asInstanceOf[UnsafeRow] + val key = getKey(row) + store.put(key.copy(), row.copy()) + numUpdatedStateRows += 1 + } + store.commit() + numTotalStateRows += store.numKeys() + store.iterator().map { case (k, v) => + numOutputRows += 1 + v.asInstanceOf[InternalRow] + } + + // Update and output only rows being evicted from the StateStore + case Some(Append) => + while (iter.hasNext) { + val row = iter.next().asInstanceOf[UnsafeRow] + val key = getKey(row) + store.put(key.copy(), row.copy()) + numUpdatedStateRows += 1 + } + + val watermarkAttribute = + keyExpressions.find(_.metadata.contains(EventTimeWatermark.delayKey)).get + // If we are evicting based on a window, use the end of the window. Otherwise just + // use the attribute itself. + val evictionExpression = + if (watermarkAttribute.dataType.isInstanceOf[StructType]) { + LessThanOrEqual( + GetStructField(watermarkAttribute, 1), + Literal(eventTimeWatermark.get * 1000)) + } else { + LessThanOrEqual( + watermarkAttribute, + Literal(eventTimeWatermark.get * 1000)) + } + + logInfo(s"Filtering state store on: $evictionExpression") + val predicate = newPredicate(evictionExpression, keyExpressions) + store.remove(predicate.eval) + + store.commit() + + numTotalStateRows += store.numKeys() + store.updates().filter(_.isInstanceOf[ValueRemoved]).map { removed => + numOutputRows += 1 + removed.value.asInstanceOf[InternalRow] + } + + // Update and output modified rows from the StateStore. 
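+        // Rows are emitted as soon as they are written to the store; the store commit is
+        // deferred until the input iterator has been fully drained.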
+ case Some(Update) => + new Iterator[InternalRow] { + private[this] val baseIterator = iter + + override def hasNext: Boolean = { + if (!baseIterator.hasNext) { + store.commit() + numTotalStateRows += store.numKeys() + false + } else { + true + } + } + + override def next(): InternalRow = { + val row = baseIterator.next().asInstanceOf[UnsafeRow] + val key = getKey(row) + store.put(key.copy(), row.copy()) + numOutputRows += 1 + numUpdatedStateRows += 1 + row + } + } + + case _ => throw new UnsupportedOperationException(s"Invalid output mode: $outputMode") + } + } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning - - /** - * Save all the rows to the state store, and return all the rows in the state store. - * Note that this returns an iterator that pipelines the saving to store with downstream - * processing. - */ - private def saveAndReturnUpdated( - store: StateStore, - iter: Iterator[InternalRow]): Iterator[InternalRow] = { - val numOutputRows = longMetric("numOutputRows") - val numTotalStateRows = longMetric("numTotalStateRows") - val numUpdatedStateRows = longMetric("numUpdatedStateRows") - - new Iterator[InternalRow] { - private[this] val baseIterator = iter - private[this] val getKey = GenerateUnsafeProjection.generate(keyExpressions, child.output) - - override def hasNext: Boolean = { - if (!baseIterator.hasNext) { - store.commit() - numTotalStateRows += store.numKeys() - false - } else { - true - } - } - - override def next(): InternalRow = { - val row = baseIterator.next().asInstanceOf[UnsafeRow] - val key = getKey(row) - store.put(key.copy(), row.copy()) - numOutputRows += 1 - numUpdatedStateRows += 1 - row - } - } - } - - /** - * Save all the rows to the state store, and return all the rows in the state store. - * Note that the saving to store is blocking; only after all the rows have been saved - * is the iterator on the update store data is generated. - */ - private def saveAndReturnAll( - store: StateStore, - iter: Iterator[InternalRow]): Iterator[InternalRow] = { - val getKey = GenerateUnsafeProjection.generate(keyExpressions, child.output) - val numOutputRows = longMetric("numOutputRows") - val numTotalStateRows = longMetric("numTotalStateRows") - val numUpdatedStateRows = longMetric("numUpdatedStateRows") - - while (iter.hasNext) { - val row = iter.next().asInstanceOf[UnsafeRow] - val key = getKey(row) - store.put(key.copy(), row.copy()) - numUpdatedStateRows += 1 - } - store.commit() - numTotalStateRows += store.numKeys() - store.iterator().map { case (k, v) => - numOutputRows += 1 - v.asInstanceOf[InternalRow] - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 57e89f85361e4..3ca6feac05cef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -92,6 +92,9 @@ class StreamExecution( /** The current batchId or -1 if execution has not yet been initialized. */ private var currentBatchId: Long = -1 + /** The current eventTime watermark, used to bound the lateness of data that will processed. */ + private var currentEventTimeWatermark: Long = 0 + /** All stream sources present in the query plan. 
*/ private val sources = logicalPlan.collect { case s: StreamingExecutionRelation => s.source } @@ -427,7 +430,8 @@ class StreamExecution( triggerLogicalPlan, outputMode, checkpointFile("state"), - currentBatchId) + currentBatchId, + currentEventTimeWatermark) lastExecution.executedPlan // Force the lazy generation of execution plan } @@ -436,6 +440,25 @@ class StreamExecution( sink.addBatch(currentBatchId, nextBatch) reportNumRows(executedPlan, triggerLogicalPlan, newData) + // Update the eventTime watermark if we find one in the plan. + // TODO: Does this need to be an AttributeMap? + lastExecution.executedPlan.collect { + case e: EventTimeWatermarkExec => + logTrace(s"Maximum observed eventTime: ${e.maxEventTime.value}") + (e.maxEventTime.value / 1000) - e.delay.milliseconds() + }.headOption.foreach { newWatermark => + if (newWatermark > currentEventTimeWatermark) { + logInfo(s"Updating eventTime watermark to: $newWatermark ms") + currentEventTimeWatermark = newWatermark + } else { + logTrace(s"Event time didn't move: $newWatermark < $currentEventTimeWatermark") + } + + if (newWatermark != 0) { + streamMetrics.reportTriggerDetail(EVENT_TIME_WATERMARK, newWatermark) + } + } + awaitBatchLock.lock() try { // Wake up any threads that are waiting for the stream to progress. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala index e98d1883e4596..5645554a58f6e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala @@ -221,6 +221,7 @@ object StreamMetrics extends Logging { val IS_TRIGGER_ACTIVE = "isTriggerActive" val IS_DATA_PRESENT_IN_TRIGGER = "isDataPresentInTrigger" val STATUS_MESSAGE = "statusMessage" + val EVENT_TIME_WATERMARK = "eventTimeWatermark" val START_TIMESTAMP = "timestamp.triggerStart" val GET_OFFSET_TIMESTAMP = "timestamp.afterGetOffset" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index f07feaad5dc71..493fdaaec5069 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -109,7 +109,7 @@ private[state] class HDFSBackedStateStoreProvider( case Some(ValueAdded(_, _)) => // Value did not exist in previous version and was added already, keep it marked as added allUpdates.put(key, ValueAdded(key, value)) - case Some(ValueUpdated(_, _)) | Some(KeyRemoved(_)) => + case Some(ValueUpdated(_, _)) | Some(ValueRemoved(_, _)) => // Value existed in previous version and updated/removed, mark it as updated allUpdates.put(key, ValueUpdated(key, value)) case None => @@ -124,24 +124,25 @@ private[state] class HDFSBackedStateStoreProvider( /** Remove keys that match the following condition */ override def remove(condition: UnsafeRow => Boolean): Unit = { verify(state == UPDATING, "Cannot remove after already committed or aborted") - - val keyIter = mapToUpdate.keySet().iterator() - while (keyIter.hasNext) { - val key = keyIter.next - if (condition(key)) { - keyIter.remove() + val entryIter = mapToUpdate.entrySet().iterator() + while (entryIter.hasNext) { + val entry = 
entryIter.next + if (condition(entry.getKey)) { + val value = entry.getValue + val key = entry.getKey + entryIter.remove() Option(allUpdates.get(key)) match { case Some(ValueUpdated(_, _)) | None => // Value existed in previous version and maybe was updated, mark removed - allUpdates.put(key, KeyRemoved(key)) + allUpdates.put(key, ValueRemoved(key, value)) case Some(ValueAdded(_, _)) => // Value did not exist in previous version and was added, should not appear in updates allUpdates.remove(key) - case Some(KeyRemoved(_)) => + case Some(ValueRemoved(_, _)) => // Remove already in update map, no need to change } - writeToDeltaFile(tempDeltaFileStream, KeyRemoved(key)) + writeToDeltaFile(tempDeltaFileStream, ValueRemoved(key, value)) } } } @@ -334,7 +335,7 @@ private[state] class HDFSBackedStateStoreProvider( writeUpdate(key, value) case ValueUpdated(key, value) => writeUpdate(key, value) - case KeyRemoved(key) => + case ValueRemoved(key, value) => writeRemove(key) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 7132e284c28f4..9bc6c0e2b9334 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -99,13 +99,16 @@ trait StateStoreProvider { /** Trait representing updates made to a [[StateStore]]. */ -sealed trait StoreUpdate +sealed trait StoreUpdate { + def key: UnsafeRow + def value: UnsafeRow +} case class ValueAdded(key: UnsafeRow, value: UnsafeRow) extends StoreUpdate case class ValueUpdated(key: UnsafeRow, value: UnsafeRow) extends StoreUpdate -case class KeyRemoved(key: UnsafeRow) extends StoreUpdate +case class ValueRemoved(key: UnsafeRow, value: UnsafeRow) extends StoreUpdate /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index 533cd0cd2a2ea..05fc7345a7daf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -668,11 +668,11 @@ private[state] object StateStoreSuite { } def updatesToSet(iterator: Iterator[StoreUpdate]): Set[TestUpdate] = { - iterator.map { _ match { + iterator.map { case ValueAdded(key, value) => Added(rowToString(key), rowToInt(value)) case ValueUpdated(key, value) => Updated(rowToString(key), rowToInt(value)) - case KeyRemoved(key) => Removed(rowToString(key)) - }}.toSet + case ValueRemoved(key, _) => Removed(rowToString(key)) + }.toSet } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/WatermarkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/WatermarkSuite.scala new file mode 100644 index 0000000000000..3617ec0f564c1 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/WatermarkSuite.scala @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.functions.{count, window} + +class WatermarkSuite extends StreamTest with BeforeAndAfter with Logging { + + import testImplicits._ + + after { + sqlContext.streams.active.foreach(_.stop()) + } + + test("error on bad column") { + val inputData = MemoryStream[Int].toDF() + val e = intercept[AnalysisException] { + inputData.withWatermark("badColumn", "1 minute") + } + assert(e.getMessage contains "badColumn") + } + + test("error on wrong type") { + val inputData = MemoryStream[Int].toDF() + val e = intercept[AnalysisException] { + inputData.withWatermark("value", "1 minute") + } + assert(e.getMessage contains "value") + assert(e.getMessage contains "int") + } + + + test("watermark metric") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start").cast("long").as[Long], $"count".as[Long]) + + testStream(windowedAggregation)( + AddData(inputData, 15), + AssertOnLastQueryStatus { status => + status.triggerDetails.get(StreamMetrics.EVENT_TIME_WATERMARK) === "5000" + }, + AddData(inputData, 15), + AssertOnLastQueryStatus { status => + status.triggerDetails.get(StreamMetrics.EVENT_TIME_WATERMARK) === "5000" + }, + AddData(inputData, 25), + AssertOnLastQueryStatus { status => + status.triggerDetails.get(StreamMetrics.EVENT_TIME_WATERMARK) === "15000" + } + ) + } + + test("append-mode watermark aggregation") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start").cast("long").as[Long], $"count".as[Long]) + + testStream(windowedAggregation)( + AddData(inputData, 10, 11, 12, 13, 14, 15), + CheckAnswer(), + AddData(inputData, 25), // Advance watermark to 15 seconds + CheckAnswer(), + AddData(inputData, 25), // Evict items less than previous watermark. + CheckAnswer((10, 5)) + ) + } + + ignore("recovery") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start").cast("long").as[Long], $"count".as[Long]) + + testStream(windowedAggregation)( + AddData(inputData, 10, 11, 12, 13, 14, 15), + CheckAnswer(), + AddData(inputData, 25), // Advance watermark to 15 seconds + StopStream, + StartStream(), + CheckAnswer(), + AddData(inputData, 25), // Evict items less than previous watermark. 
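+      // The finalized [10, 15) window holds five rows (10 through 14), which is what the
+      // CheckAnswer((10, 5)) after the restart below expects.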
+ StopStream, + StartStream(), + CheckAnswer((10, 5)) + ) + } + + test("dropping old data") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start").cast("long").as[Long], $"count".as[Long]) + + testStream(windowedAggregation)( + AddData(inputData, 10, 11, 12), + CheckAnswer(), + AddData(inputData, 25), // Advance watermark to 15 seconds + CheckAnswer(), + AddData(inputData, 25), // Evict items less than previous watermark. + CheckAnswer((10, 3)), + AddData(inputData, 10), // 10 is later than 15 second watermark + CheckAnswer((10, 3)), + AddData(inputData, 25), + CheckAnswer((10, 3)) // Should not emit an incorrect partial result. + ) + } + + test("complete mode") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start").cast("long").as[Long], $"count".as[Long]) + + // No eviction when asked to compute complete results. + testStream(windowedAggregation, OutputMode.Complete)( + AddData(inputData, 10, 11, 12), + CheckAnswer((10, 3)), + AddData(inputData, 25), + CheckAnswer((10, 3), (25, 1)), + AddData(inputData, 25), + CheckAnswer((10, 3), (25, 2)), + AddData(inputData, 10), + CheckAnswer((10, 4), (25, 2)), + AddData(inputData, 25), + CheckAnswer((10, 4), (25, 3)) + ) + } + + test("group by on raw timestamp") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy($"eventTime") + .agg(count("*") as 'count) + .select($"eventTime".cast("long").as[Long], $"count".as[Long]) + + testStream(windowedAggregation)( + AddData(inputData, 10), + CheckAnswer(), + AddData(inputData, 25), // Advance watermark to 15 seconds + CheckAnswer(), + AddData(inputData, 25), // Evict items less than previous watermark. + CheckAnswer((10, 1)) + ) + } +} From 649c15fae423a415cb6165aa0ef6d97ab4949afb Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 14 Nov 2016 21:15:39 -0800 Subject: [PATCH 107/534] [SPARK-18428][DOC] Update docs for GraphX ## What changes were proposed in this pull request? 1, Add link of `VertexRDD` and `EdgeRDD` 2, Notify in `Vertex and Edge RDDs` that not all methods are listed 3, `VertexID` -> `VertexId` ## How was this patch tested? No tests, only docs is modified Author: Zheng RuiFeng Closes #15875 from zhengruifeng/update_graphop_doc. 
(cherry picked from commit c31def1ddcbed340bfc071d54fb3dc7945cb525a) Signed-off-by: Reynold Xin --- docs/graphx-programming-guide.md | 68 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 58671e6f146d8..1097cf1211c1f 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -11,6 +11,7 @@ description: GraphX graph processing library guide for Spark SPARK_VERSION_SHORT [EdgeRDD]: api/scala/index.html#org.apache.spark.graphx.EdgeRDD +[VertexRDD]: api/scala/index.html#org.apache.spark.graphx.VertexRDD [Edge]: api/scala/index.html#org.apache.spark.graphx.Edge [EdgeTriplet]: api/scala/index.html#org.apache.spark.graphx.EdgeTriplet [Graph]: api/scala/index.html#org.apache.spark.graphx.Graph @@ -89,7 +90,7 @@ with user defined objects attached to each vertex and edge. A directed multigra graph with potentially multiple parallel edges sharing the same source and destination vertex. The ability to support parallel edges simplifies modeling scenarios where there can be multiple relationships (e.g., co-worker and friend) between the same vertices. Each vertex is keyed by a -*unique* 64-bit long identifier (`VertexID`). GraphX does not impose any ordering constraints on +*unique* 64-bit long identifier (`VertexId`). GraphX does not impose any ordering constraints on the vertex identifiers. Similarly, edges have corresponding source and destination vertex identifiers. @@ -130,12 +131,12 @@ class Graph[VD, ED] { } {% endhighlight %} -The classes `VertexRDD[VD]` and `EdgeRDD[ED]` extend and are optimized versions of `RDD[(VertexID, +The classes `VertexRDD[VD]` and `EdgeRDD[ED]` extend and are optimized versions of `RDD[(VertexId, VD)]` and `RDD[Edge[ED]]` respectively. Both `VertexRDD[VD]` and `EdgeRDD[ED]` provide additional functionality built around graph computation and leverage internal optimizations. We discuss the -`VertexRDD` and `EdgeRDD` API in greater detail in the section on [vertex and edge +`VertexRDD`[VertexRDD] and `EdgeRDD`[EdgeRDD] API in greater detail in the section on [vertex and edge RDDs](#vertex_and_edge_rdds) but for now they can be thought of as simply RDDs of the form: -`RDD[(VertexID, VD)]` and `RDD[Edge[ED]]`. +`RDD[(VertexId, VD)]` and `RDD[Edge[ED]]`. ### Example Property Graph @@ -197,7 +198,7 @@ graph.edges.filter(e => e.srcId > e.dstId).count {% endhighlight %} > Note that `graph.vertices` returns an `VertexRDD[(String, String)]` which extends -> `RDD[(VertexID, (String, String))]` and so we use the scala `case` expression to deconstruct the +> `RDD[(VertexId, (String, String))]` and so we use the scala `case` expression to deconstruct the > tuple. On the other hand, `graph.edges` returns an `EdgeRDD` containing `Edge[String]` objects. 
> We could have also used the case class type constructor as in the following: > {% highlight scala %} @@ -287,7 +288,7 @@ class Graph[VD, ED] { // Change the partitioning heuristic ============================================================ def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] // Transform vertex and edge attributes ========================================================== - def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] + def mapVertices[VD2](map: (VertexId, VD) => VD2): Graph[VD2, ED] def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2] def mapEdges[ED2](map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2] def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] @@ -297,18 +298,18 @@ class Graph[VD, ED] { def reverse: Graph[VD, ED] def subgraph( epred: EdgeTriplet[VD,ED] => Boolean = (x => true), - vpred: (VertexID, VD) => Boolean = ((v, d) => true)) + vpred: (VertexId, VD) => Boolean = ((v, d) => true)) : Graph[VD, ED] def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED] def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] // Join RDDs with the graph ====================================================================== - def joinVertices[U](table: RDD[(VertexID, U)])(mapFunc: (VertexID, VD, U) => VD): Graph[VD, ED] - def outerJoinVertices[U, VD2](other: RDD[(VertexID, U)]) - (mapFunc: (VertexID, VD, Option[U]) => VD2) + def joinVertices[U](table: RDD[(VertexId, U)])(mapFunc: (VertexId, VD, U) => VD): Graph[VD, ED] + def outerJoinVertices[U, VD2](other: RDD[(VertexId, U)]) + (mapFunc: (VertexId, VD, Option[U]) => VD2) : Graph[VD2, ED] // Aggregate information about adjacent triplets ================================================= - def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]] - def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexID, VD)]] + def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexId]] + def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexId, VD)]] def aggregateMessages[Msg: ClassTag]( sendMsg: EdgeContext[VD, ED, Msg] => Unit, mergeMsg: (Msg, Msg) => Msg, @@ -316,15 +317,15 @@ class Graph[VD, ED] { : VertexRDD[A] // Iterative graph-parallel computation ========================================================== def pregel[A](initialMsg: A, maxIterations: Int, activeDirection: EdgeDirection)( - vprog: (VertexID, VD, A) => VD, - sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID,A)], + vprog: (VertexId, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId,A)], mergeMsg: (A, A) => A) : Graph[VD, ED] // Basic graph algorithms ======================================================================== def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] - def connectedComponents(): Graph[VertexID, ED] + def connectedComponents(): Graph[VertexId, ED] def triangleCount(): Graph[Int, ED] - def stronglyConnectedComponents(numIter: Int): Graph[VertexID, ED] + def stronglyConnectedComponents(numIter: Int): Graph[VertexId, ED] } {% endhighlight %} @@ -481,7 +482,7 @@ original value. > is therefore recommended that the input RDD be made unique using the following which will > also *pre-index* the resulting values to substantially accelerate the subsequent join. 
> {% highlight scala %} -val nonUniqueCosts: RDD[(VertexID, Double)] +val nonUniqueCosts: RDD[(VertexId, Double)] val uniqueCosts: VertexRDD[Double] = graph.vertices.aggregateUsingIndex(nonUnique, (a,b) => a + b) val joinedGraph = graph.joinVertices(uniqueCosts)( @@ -511,7 +512,7 @@ val degreeGraph = graph.outerJoinVertices(outDegrees) { (id, oldAttr, outDegOpt) > provide type annotation for the user defined function: > {% highlight scala %} val joinedGraph = graph.joinVertices(uniqueCosts, - (id: VertexID, oldCost: Double, extraCost: Double) => oldCost + extraCost) + (id: VertexId, oldCost: Double, extraCost: Double) => oldCost + extraCost) {% endhighlight %} > @@ -558,7 +559,7 @@ The user defined `mergeMsg` function takes two messages destined to the same ver yields a single message. Think of `mergeMsg` as the reduce function in map-reduce. The [`aggregateMessages`][Graph.aggregateMessages] operator returns a `VertexRDD[Msg]` containing the aggregate message (of type `Msg`) destined to each vertex. Vertices that did not -receive a message are not included in the returned `VertexRDD`. +receive a message are not included in the returned `VertexRDD`[VertexRDD]. + +More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.regression.LinearRegression). + {% include_example python/ml/linear_regression_with_elastic_net.py %}
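As a quick reference for readers of this patch, the linked elastic-net `LinearRegression` example boils down to roughly the following sketch (Scala; `training` is an assumed DataFrame with `label` and `features` columns, e.g. loaded from a libsvm file, and is not part of this diff):

```scala
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setMaxIter(10)
  .setRegParam(0.3)
  .setElasticNetParam(0.8)  // 0.0 is pure L2 (ridge), 1.0 is pure L1 (lasso)

val lrModel = lr.fit(training)
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

// Summary statistics over the training data.
val trainingSummary = lrModel.summary
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")
```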
@@ -519,18 +546,21 @@ function and extracting model summary statistics.
+ Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.GeneralizedLinearRegression) for more details. {% include_example scala/org/apache/spark/examples/ml/GeneralizedLinearRegressionExample.scala %}
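A rough sketch of what the linked Scala example exercises, including the summary statistics mentioned in the surrounding section (`dataset` is an assumed DataFrame with `label` and `features` columns; the family, link and regularization values are illustrative and not taken from the example file):

```scala
import org.apache.spark.ml.regression.GeneralizedLinearRegression

val glr = new GeneralizedLinearRegression()
  .setFamily("gaussian")
  .setLink("identity")
  .setMaxIter(10)
  .setRegParam(0.3)

val model = glr.fit(dataset)

// The training summary exposes the model statistics referred to above.
val summary = model.summary
println(s"Coefficient standard errors: ${summary.coefficientStandardErrors.mkString(",")}")
println(s"P values: ${summary.pValues.mkString(",")}")
summary.residuals().show()
```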
+ Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/GeneralizedLinearRegression.html) for more details. {% include_example java/org/apache/spark/examples/ml/JavaGeneralizedLinearRegressionExample.java %}
+ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.GeneralizedLinearRegression) for more details. {% include_example python/ml/generalized_linear_regression_example.py %} @@ -705,14 +735,23 @@ The implementation matches the result from R's survival function
+ +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression) for more details. + {% include_example scala/org/apache/spark/examples/ml/AFTSurvivalRegressionExample.scala %}
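For orientation, the AFT survival regression API used by the examples above looks roughly like this (the toy rows and quantile settings are illustrative, not taken from the example file; `censor == 1.0` marks an observed event, `0.0` a censored one):

```scala
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.AFTSurvivalRegression

val training = spark.createDataFrame(Seq(
  (1.218, 1.0, Vectors.dense(1.560, -0.605)),
  (2.949, 0.0, Vectors.dense(0.346, 2.158)),
  (0.273, 1.0, Vectors.dense(0.520, 1.151))
)).toDF("label", "censor", "features")

val aft = new AFTSurvivalRegression()
  .setQuantileProbabilities(Array(0.3, 0.6))
  .setQuantilesCol("quantiles")

val model = aft.fit(training)
println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept} Scale: ${model.scale}")
model.transform(training).show(false)
```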
+ +Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/AFTSurvivalRegression.html) for more details. + {% include_example java/org/apache/spark/examples/ml/JavaAFTSurvivalRegressionExample.java %}
+ +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.AFTSurvivalRegression) for more details. + {% include_example python/ml/aft_survival_regression.py %}
diff --git a/docs/ml-pipeline.md b/docs/ml-pipeline.md index adb057ba7e250..b4d6be94f5eb0 100644 --- a/docs/ml-pipeline.md +++ b/docs/ml-pipeline.md @@ -207,14 +207,29 @@ This example covers the concepts of `Estimator`, `Transformer`, and `Param`.
+ +Refer to the [`Estimator` Scala docs](api/scala/index.html#org.apache.spark.ml.Estimator), +the [`Transformer` Scala docs](api/scala/index.html#org.apache.spark.ml.Transformer) and +the [`Params` Scala docs](api/scala/index.html#org.apache.spark.ml.param.Params) for details on the API. + {% include_example scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala %}
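A condensed sketch of the `Estimator`/`Transformer`/`Param` interplay the example covers (`training` and `test` are assumed DataFrames of labeled feature vectors, not part of this diff):

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap

val lr = new LogisticRegression()      // LogisticRegression is an Estimator
lr.setMaxIter(10).setRegParam(0.01)    // Params set directly on the Estimator

val model1 = lr.fit(training)          // fit() produces a Transformer (a Model)

// Params can also be supplied at fit time as a ParamMap, overriding the setters.
val paramMap = ParamMap(lr.maxIter -> 20, lr.regParam -> 0.1)
val model2 = lr.fit(training, paramMap)

// A Transformer maps one DataFrame to another.
model2.transform(test).select("features", "probability", "prediction").show()
```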
+ +Refer to the [`Estimator` Java docs](api/java/org/apache/spark/ml/Estimator.html), +the [`Transformer` Java docs](api/java/org/apache/spark/ml/Transformer.html) and +the [`Params` Java docs](api/java/org/apache/spark/ml/param/Params.html) for details on the API. + {% include_example java/org/apache/spark/examples/ml/JavaEstimatorTransformerParamExample.java %}
+ +Refer to the [`Estimator` Python docs](api/python/pyspark.ml.html#pyspark.ml.Estimator), +the [`Transformer` Python docs](api/python/pyspark.ml.html#pyspark.ml.Transformer) and +the [`Params` Python docs](api/python/pyspark.ml.html#pyspark.ml.param.Params) for more details on the API. + {% include_example python/ml/estimator_transformer_param_example.py %}
@@ -227,14 +242,24 @@ This example follows the simple text document `Pipeline` illustrated in the figu
+ +Refer to the [`Pipeline` Scala docs](api/scala/index.html#org.apache.spark.ml.Pipeline) for details on the API. + {% include_example scala/org/apache/spark/examples/ml/PipelineExample.scala %}
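A compact sketch of the three-stage text-document pipeline the example builds (the column names and the `training`/`test` DataFrames are assumed for illustration):

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF()
  .setNumFeatures(1000)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.001)

val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

// Fitting the Pipeline runs the stages in order and returns a PipelineModel.
val model = pipeline.fit(training)

// The PipelineModel applies the same stages, now as Transformers, to new documents.
model.transform(test).select("id", "text", "probability", "prediction").show()
```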
+ + +Refer to the [`Pipeline` Java docs](api/java/org/apache/spark/ml/Pipeline.html) for details on the API. + {% include_example java/org/apache/spark/examples/ml/JavaPipelineExample.java %}
+ +Refer to the [`Pipeline` Python docs](api/python/pyspark.ml.html#pyspark.ml.Pipeline) for more details on the API. + {% include_example python/ml/pipeline_example.py %}
diff --git a/docs/ml-tuning.md b/docs/ml-tuning.md index 2ca90c7092fd3..15748720b7ae2 100644 --- a/docs/ml-tuning.md +++ b/docs/ml-tuning.md @@ -75,15 +75,23 @@ However, it is also a well-established method for choosing parameters which is m
+ +Refer to the [`CrossValidator` Scala docs](api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) for details on the API. + {% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaCrossValidationExample.scala %}
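In outline, the linked example wires a small pipeline into `CrossValidator` roughly as follows (a sketch rather than the example source; the grid values, column names and `training`/`test` DataFrames are illustrative assumptions):

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10)
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

// 3 x 2 = 6 parameter settings to search over.
val paramGrid = new ParamGridBuilder()
  .addGrid(hashingTF.numFeatures, Array(10, 100, 1000))
  .addGrid(lr.regParam, Array(0.1, 0.01))
  .build()

val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3)

// cvModel wraps the pipeline refit with the best parameter combination.
val cvModel = cv.fit(training)
cvModel.transform(test).select("id", "text", "probability", "prediction").show()
```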
+ +Refer to the [`CrossValidator` Java docs](api/java/org/apache/spark/ml/tuning/CrossValidator.html) for details on the API. + {% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaCrossValidationExample.java %}
+Refer to the [`CrossValidator` Python docs](api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator) for more details on the API. + {% include_example python/ml/cross_validator.py %}
@@ -107,14 +115,23 @@ Like `CrossValidator`, `TrainValidationSplit` finally fits the `Estimator` using
+ +Refer to the [`TrainValidationSplit` Scala docs](api/scala/index.html#org.apache.spark.ml.tuning.TrainValidationSplit) for details on the API. + {% include_example scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala %}
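A minimal sketch of the `TrainValidationSplit` usage the example demonstrates (`training` and `test` are assumed DataFrames with `label` and `features` columns; the grid values are illustrative):

```scala
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

val lr = new LinearRegression()

val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.1, 0.01))
  .addGrid(lr.fitIntercept)
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
  .build()

// Unlike CrossValidator, only a single (train, validation) split is evaluated.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEvaluator(new RegressionEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)  // 80% for training, 20% for validation

val model = trainValidationSplit.fit(training)
model.transform(test).select("features", "label", "prediction").show()
```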
+ +Refer to the [`TrainValidationSplit` Java docs](api/java/org/apache/spark/ml/tuning/TrainValidationSplit.html) for details on the API. + {% include_example java/org/apache/spark/examples/ml/JavaModelSelectionViaTrainValidationSplitExample.java %}
+ +Refer to the [`TrainValidationSplit` Python docs](api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplit) for more details on the API. + {% include_example python/ml/train_validation_split.py %}
From b0ae8712358fc8c07aa5efe4d0bd337e7e452078 Mon Sep 17 00:00:00 2001 From: Xianyang Liu Date: Wed, 16 Nov 2016 11:59:00 +0000 Subject: [PATCH 126/534] [SPARK-18420][BUILD] Fix the errors caused by lint check in Java Small fix, fix the errors caused by lint check in Java - Clear unused objects and `UnusedImports`. - Add comments around the method `finalize` of `NioBufferedFileInputStream`to turn off checkstyle. - Cut the line which is longer than 100 characters into two lines. Travis CI. ``` $ build/mvn -T 4 -q -DskipTests -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install $ dev/lint-java ``` Before: ``` Checkstyle checks failed at following occurrences: [ERROR] src/main/java/org/apache/spark/network/util/TransportConf.java:[21,8] (imports) UnusedImports: Unused import - org.apache.commons.crypto.cipher.CryptoCipherFactory. [ERROR] src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java:[516,5] (modifier) RedundantModifier: Redundant 'public' modifier. [ERROR] src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java:[133] (coding) NoFinalizer: Avoid using finalizer method. [ERROR] src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java:[71] (sizes) LineLength: Line is longer than 100 characters (found 113). [ERROR] src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java:[112] (sizes) LineLength: Line is longer than 100 characters (found 110). [ERROR] src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java:[31,17] (modifier) ModifierOrder: 'static' modifier out of order with the JLS suggestions. [ERROR]src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java:[64] (sizes) LineLength: Line is longer than 100 characters (found 103). [ERROR] src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java:[22,8] (imports) UnusedImports: Unused import - org.apache.spark.ml.linalg.Vectors. [ERROR] src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java:[51] (regexp) RegexpSingleline: No trailing whitespace allowed. ``` After: ``` $ build/mvn -T 4 -q -DskipTests -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install $ dev/lint-java Using `mvn` from path: /home/travis/build/ConeyLiu/spark/build/apache-maven-3.3.9/bin/mvn Checkstyle checks passed. ``` Author: Xianyang Liu Closes #15865 from ConeyLiu/master. 
(cherry picked from commit 7569cf6cb85bda7d0e76d3e75e286d4796e77e08) Signed-off-by: Sean Owen --- .../spark/io/NioBufferedFileInputStream.java | 2 ++ dev/checkstyle.xml | 15 +++++++++++++++ .../spark/examples/ml/JavaInteractionExample.java | 3 +-- ...vaLogisticRegressionWithElasticNetExample.java | 4 ++-- .../sql/catalyst/expressions/UnsafeArrayData.java | 3 ++- .../sql/catalyst/expressions/UnsafeMapData.java | 3 ++- .../sql/catalyst/expressions/HiveHasherSuite.java | 1 - 7 files changed, 24 insertions(+), 7 deletions(-) diff --git a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java index f6d1288cb263d..ea5f1a9abf69b 100644 --- a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java +++ b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java @@ -130,8 +130,10 @@ public synchronized void close() throws IOException { StorageUtils.dispose(byteBuffer); } + //checkstyle.off: NoFinalizer @Override protected void finalize() throws IOException { close(); } + //checkstyle.on: NoFinalizer } diff --git a/dev/checkstyle.xml b/dev/checkstyle.xml index 3de6aa91dcd51..92c5251c85037 100644 --- a/dev/checkstyle.xml +++ b/dev/checkstyle.xml @@ -52,6 +52,20 @@ + + + + + + + @@ -168,5 +182,6 @@ + diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java index 4213c05703cc6..3684a87e22e7b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaInteractionExample.java @@ -19,7 +19,6 @@ import org.apache.spark.ml.feature.Interaction; import org.apache.spark.ml.feature.VectorAssembler; -import org.apache.spark.ml.linalg.Vectors; import org.apache.spark.sql.*; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.Metadata; @@ -48,7 +47,7 @@ public static void main(String[] args) { RowFactory.create(5, 9, 2, 7, 10, 7, 3), RowFactory.create(6, 1, 1, 4, 2, 8, 4) ); - + StructType schema = new StructType(new StructField[]{ new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()), new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()), diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java index b8fb5972ea418..4cdec21d23023 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaLogisticRegressionWithElasticNetExample.java @@ -60,8 +60,8 @@ public static void main(String[] args) { LogisticRegressionModel mlrModel = mlr.fit(training); // Print the coefficients and intercepts for logistic regression with multinomial family - System.out.println("Multinomial coefficients: " - + lrModel.coefficientMatrix() + "\nMultinomial intercepts: " + mlrModel.interceptVector()); + System.out.println("Multinomial coefficients: " + lrModel.coefficientMatrix() + + "\nMultinomial intercepts: " + mlrModel.interceptVector()); // $example off$ spark.stop(); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 
86523c1474015..e8c33871f97bc 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -109,7 +109,8 @@ public void pointTo(Object baseObject, long baseOffset, int sizeInBytes) { // Read the number of elements from the first 8 bytes. final long numElements = Platform.getLong(baseObject, baseOffset); assert numElements >= 0 : "numElements (" + numElements + ") should >= 0"; - assert numElements <= Integer.MAX_VALUE : "numElements (" + numElements + ") should <= Integer.MAX_VALUE"; + assert numElements <= Integer.MAX_VALUE : + "numElements (" + numElements + ") should <= Integer.MAX_VALUE"; this.numElements = (int)numElements; this.baseObject = baseObject; diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java index 35029f5a50e3e..f17441dfccb6d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java @@ -68,7 +68,8 @@ public void pointTo(Object baseObject, long baseOffset, int sizeInBytes) { // Read the numBytes of key array from the first 8 bytes. final long keyArraySize = Platform.getLong(baseObject, baseOffset); assert keyArraySize >= 0 : "keyArraySize (" + keyArraySize + ") should >= 0"; - assert keyArraySize <= Integer.MAX_VALUE : "keyArraySize (" + keyArraySize + ") should <= Integer.MAX_VALUE"; + assert keyArraySize <= Integer.MAX_VALUE : + "keyArraySize (" + keyArraySize + ") should <= Integer.MAX_VALUE"; final int valueArraySize = sizeInBytes - (int)keyArraySize - 8; assert valueArraySize >= 0 : "valueArraySize (" + valueArraySize + ") should >= 0"; diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java index 67a5eb0c7fe8f..b67c6f3e6e85e 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/HiveHasherSuite.java @@ -28,7 +28,6 @@ import java.util.Set; public class HiveHasherSuite { - private final static HiveHasher hasher = new HiveHasher(); @Test public void testKnownIntegerInputs() { From c0dbe08d604dea543eb17ccb802a8a20d6c21a69 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 16 Nov 2016 08:25:15 -0800 Subject: [PATCH 127/534] [SPARK-18415][SQL] Weird Plan Output when CTE used in RunnableCommand ### What changes were proposed in this pull request? Currently, when CTE is used in RunnableCommand, the Analyzer does not replace the logical node `With`. The child plan of RunnableCommand is not resolved. Thus, the output of the `With` plan node looks very confusing. 
For example, ``` sql( """ |CREATE VIEW cte_view AS |WITH w AS (SELECT 1 AS n), cte1 (select 2), cte2 as (select 3) |SELECT n FROM w """.stripMargin).explain() ``` The output is like ``` ExecutedCommand +- CreateViewCommand `cte_view`, WITH w AS (SELECT 1 AS n), cte1 (select 2), cte2 as (select 3) SELECT n FROM w, false, false, PersistedView +- 'With [(w,SubqueryAlias w +- Project [1 AS n#16] +- OneRowRelation$ ), (cte1,'SubqueryAlias cte1 +- 'Project [unresolvedalias(2, None)] +- OneRowRelation$ ), (cte2,'SubqueryAlias cte2 +- 'Project [unresolvedalias(3, None)] +- OneRowRelation$ )] +- 'Project ['n] +- 'UnresolvedRelation `w` ``` After the fix, the output is as shown below. ``` ExecutedCommand +- CreateViewCommand `cte_view`, WITH w AS (SELECT 1 AS n), cte1 (select 2), cte2 as (select 3) SELECT n FROM w, false, false, PersistedView +- CTE [w, cte1, cte2] : :- SubqueryAlias w : : +- Project [1 AS n#16] : : +- OneRowRelation$ : :- 'SubqueryAlias cte1 : : +- 'Project [unresolvedalias(2, None)] : : +- OneRowRelation$ : +- 'SubqueryAlias cte2 : +- 'Project [unresolvedalias(3, None)] : +- OneRowRelation$ +- 'Project ['n] +- 'UnresolvedRelation `w` ``` BTW, this PR also fixes the output of the view type. ### How was this patch tested? Manual Author: gatorsmile Closes #15854 from gatorsmile/cteName. (cherry picked from commit 608ecc512b759514c75a1b475582f237ed569f10) Signed-off-by: Herman van Hovell --- .../catalyst/plans/logical/basicLogicalOperators.scala | 8 ++++++++ .../org/apache/spark/sql/execution/command/views.scala | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 574caf039d3d2..dd6c8fd1dcf3e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils /** * When planning take() or collect() operations, this special node that is inserted at the top of @@ -405,6 +406,13 @@ case class InsertIntoTable( */ case class With(child: LogicalPlan, cteRelations: Seq[(String, SubqueryAlias)]) extends UnaryNode { override def output: Seq[Attribute] = child.output + + override def simpleString: String = { + val cteAliases = Utils.truncatedString(cteRelations.map(_._1), "[", ", ", "]") + s"CTE $cteAliases" + } + + override def innerChildren: Seq[QueryPlan[_]] = cteRelations.map(_._2) } case class WithWindowDefinition( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index 30472ec45ce44..154141bf83c7d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -33,7 +33,9 @@ import org.apache.spark.sql.types.MetadataBuilder * ViewType is used to specify the expected view type when we want to create or replace a view in * [[CreateViewCommand]]. 
*/ -sealed trait ViewType +sealed trait ViewType { + override def toString: String = getClass.getSimpleName.stripSuffix("$") +} /** * LocalTempView means session-scoped local temporary views. Its lifetime is the lifetime of the From b86e962c90c4322cd98b5bf3b19e251da2d32442 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 16 Nov 2016 10:00:59 -0800 Subject: [PATCH 128/534] [SPARK-18459][SPARK-18460][STRUCTUREDSTREAMING] Rename triggerId to batchId and add triggerDetails to json in StreamingQueryStatus ## What changes were proposed in this pull request? SPARK-18459: triggerId seems like a number that should be increasing with each trigger, whether or not there is data in it. However, actually, triggerId increases only where there is a batch of data in a trigger. So its better to rename it to batchId. SPARK-18460: triggerDetails was missing from json representation. Fixed it. ## How was this patch tested? Updated existing unit tests. Author: Tathagata Das Closes #15895 from tdas/SPARK-18459. (cherry picked from commit 0048ce7ce64b02cbb6a1c4a2963a0b1b9541047e) Signed-off-by: Shixiong Zhu --- python/pyspark/sql/streaming.py | 6 ++--- .../execution/streaming/StreamMetrics.scala | 8 +++---- .../sql/streaming/StreamingQueryStatus.scala | 4 ++-- .../streaming/StreamMetricsSuite.scala | 8 +++---- .../StreamingQueryListenerSuite.scala | 4 ++-- .../streaming/StreamingQueryStatusSuite.scala | 22 +++++++++++++++++-- 6 files changed, 35 insertions(+), 17 deletions(-) diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index f326f16232690..0e4589be976ea 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -212,12 +212,12 @@ def __str__(self): Processing rate 23.5 rows/sec Latency: 345.0 ms Trigger details: + batchId: 5 isDataPresentInTrigger: true isTriggerActive: true latency.getBatch.total: 20 latency.getOffset.total: 10 numRows.input.total: 100 - triggerId: 5 Source statuses [1 source]: Source 1 - MySource1 Available offset: 0 @@ -341,8 +341,8 @@ def triggerDetails(self): If no trigger is currently active, then it will have details of the last completed trigger. 
>>> sqs.triggerDetails - {u'triggerId': u'5', u'latency.getBatch.total': u'20', u'numRows.input.total': u'100', - u'isTriggerActive': u'true', u'latency.getOffset.total': u'10', + {u'latency.getBatch.total': u'20', u'numRows.input.total': u'100', + u'isTriggerActive': u'true', u'batchId': u'5', u'latency.getOffset.total': u'10', u'isDataPresentInTrigger': u'true'} """ return self._jsqs.triggerDetails() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala index 5645554a58f6e..942e6ed8944be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetrics.scala @@ -78,13 +78,13 @@ class StreamMetrics(sources: Set[Source], triggerClock: Clock, codahaleSourceNam // =========== Setter methods =========== - def reportTriggerStarted(triggerId: Long): Unit = synchronized { + def reportTriggerStarted(batchId: Long): Unit = synchronized { numInputRows.clear() triggerDetails.clear() sourceTriggerDetails.values.foreach(_.clear()) - reportTriggerDetail(TRIGGER_ID, triggerId) - sources.foreach(s => reportSourceTriggerDetail(s, TRIGGER_ID, triggerId)) + reportTriggerDetail(BATCH_ID, batchId) + sources.foreach(s => reportSourceTriggerDetail(s, BATCH_ID, batchId)) reportTriggerDetail(IS_TRIGGER_ACTIVE, true) currentTriggerStartTimestamp = triggerClock.getTimeMillis() reportTriggerDetail(START_TIMESTAMP, currentTriggerStartTimestamp) @@ -217,7 +217,7 @@ object StreamMetrics extends Logging { } - val TRIGGER_ID = "triggerId" + val BATCH_ID = "batchId" val IS_TRIGGER_ACTIVE = "isTriggerActive" val IS_DATA_PRESENT_IN_TRIGGER = "isDataPresentInTrigger" val STATUS_MESSAGE = "statusMessage" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala index 99c7729d02351..ba732ff7fc2ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryStatus.scala @@ -102,7 +102,7 @@ class StreamingQueryStatus private( ("inputRate" -> JDouble(inputRate)) ~ ("processingRate" -> JDouble(processingRate)) ~ ("latency" -> latency.map(JDouble).getOrElse(JNothing)) ~ - ("triggerDetails" -> JsonProtocol.mapToJson(triggerDetails.asScala)) + ("triggerDetails" -> JsonProtocol.mapToJson(triggerDetails.asScala)) ~ ("sourceStatuses" -> JArray(sourceStatuses.map(_.jsonValue).toList)) ~ ("sinkStatus" -> sinkStatus.jsonValue) } @@ -151,7 +151,7 @@ private[sql] object StreamingQueryStatus { desc = "MySink", offsetDesc = OffsetSeq(Some(LongOffset(1)) :: None :: Nil).toString), triggerDetails = Map( - TRIGGER_ID -> "5", + BATCH_ID -> "5", IS_TRIGGER_ACTIVE -> "true", IS_DATA_PRESENT_IN_TRIGGER -> "true", GET_OFFSET_LATENCY -> "10", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala index 938423db64745..38c4ece439770 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/StreamMetricsSuite.scala @@ -50,10 +50,10 @@ class StreamMetricsSuite extends SparkFunSuite { assert(sm.currentSourceProcessingRate(source) === 
0.0) assert(sm.currentLatency() === None) assert(sm.currentTriggerDetails() === - Map(TRIGGER_ID -> "1", IS_TRIGGER_ACTIVE -> "true", + Map(BATCH_ID -> "1", IS_TRIGGER_ACTIVE -> "true", START_TIMESTAMP -> "0", "key" -> "value")) assert(sm.currentSourceTriggerDetails(source) === - Map(TRIGGER_ID -> "1", "key2" -> "value2")) + Map(BATCH_ID -> "1", "key2" -> "value2")) // Finishing the trigger should calculate the rates, except input rate which needs // to have another trigger interval @@ -66,11 +66,11 @@ class StreamMetricsSuite extends SparkFunSuite { assert(sm.currentSourceProcessingRate(source) === 100.0) assert(sm.currentLatency() === None) assert(sm.currentTriggerDetails() === - Map(TRIGGER_ID -> "1", IS_TRIGGER_ACTIVE -> "false", + Map(BATCH_ID -> "1", IS_TRIGGER_ACTIVE -> "false", START_TIMESTAMP -> "0", FINISH_TIMESTAMP -> "1000", NUM_INPUT_ROWS -> "100", "key" -> "value")) assert(sm.currentSourceTriggerDetails(source) === - Map(TRIGGER_ID -> "1", NUM_SOURCE_INPUT_ROWS -> "100", "key2" -> "value2")) + Map(BATCH_ID -> "1", NUM_SOURCE_INPUT_ROWS -> "100", "key2" -> "value2")) // After another trigger starts, the rates and latencies should not change until // new rows are reported diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index cebb32a0a56cc..98f3bec7080af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -84,7 +84,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { AssertOnLastQueryStatus { status: StreamingQueryStatus => // Check the correctness of the trigger info of the last completed batch reported by // onQueryProgress - assert(status.triggerDetails.containsKey("triggerId")) + assert(status.triggerDetails.containsKey("batchId")) assert(status.triggerDetails.get("isTriggerActive") === "false") assert(status.triggerDetails.get("isDataPresentInTrigger") === "true") @@ -104,7 +104,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(status.triggerDetails.get("numRows.state.aggregation1.updated") === "1") assert(status.sourceStatuses.length === 1) - assert(status.sourceStatuses(0).triggerDetails.containsKey("triggerId")) + assert(status.sourceStatuses(0).triggerDetails.containsKey("batchId")) assert(status.sourceStatuses(0).triggerDetails.get("latency.getOffset.source") === "100") assert(status.sourceStatuses(0).triggerDetails.get("latency.getBatch.source") === "200") assert(status.sourceStatuses(0).triggerDetails.get("numRows.input.source") === "2") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala index 6af19fb0c2327..50a7d92ede9a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusSuite.scala @@ -48,12 +48,12 @@ class StreamingQueryStatusSuite extends SparkFunSuite { | Processing rate 23.5 rows/sec | Latency: 345.0 ms | Trigger details: + | batchId: 5 | isDataPresentInTrigger: true | isTriggerActive: true | latency.getBatch.total: 20 | latency.getOffset.total: 10 | numRows.input.total: 100 - | triggerId: 5 | Source statuses [1 source]: | Source 1 - MySource1 | 
Available offset: 0 @@ -72,7 +72,11 @@ class StreamingQueryStatusSuite extends SparkFunSuite { test("json") { assert(StreamingQueryStatus.testStatus.json === """ - |{"sourceStatuses":[{"description":"MySource1","offsetDesc":"0","inputRate":15.5, + |{"name":"query","id":1,"timestamp":123,"inputRate":15.5,"processingRate":23.5, + |"latency":345.0,"triggerDetails":{"latency.getBatch.total":"20", + |"numRows.input.total":"100","isTriggerActive":"true","batchId":"5", + |"latency.getOffset.total":"10","isDataPresentInTrigger":"true"}, + |"sourceStatuses":[{"description":"MySource1","offsetDesc":"0","inputRate":15.5, |"processingRate":23.5,"triggerDetails":{"numRows.input.source":"100", |"latency.getOffset.source":"10","latency.getBatch.source":"20"}}], |"sinkStatus":{"description":"MySink","offsetDesc":"[1, -]"}} @@ -84,6 +88,20 @@ class StreamingQueryStatusSuite extends SparkFunSuite { StreamingQueryStatus.testStatus.prettyJson === """ |{ + | "name" : "query", + | "id" : 1, + | "timestamp" : 123, + | "inputRate" : 15.5, + | "processingRate" : 23.5, + | "latency" : 345.0, + | "triggerDetails" : { + | "latency.getBatch.total" : "20", + | "numRows.input.total" : "100", + | "isTriggerActive" : "true", + | "batchId" : "5", + | "latency.getOffset.total" : "10", + | "isDataPresentInTrigger" : "true" + | }, | "sourceStatuses" : [ { | "description" : "MySource1", | "offsetDesc" : "0", From 3d4756d56b852dcf4e1bebe621d4a30570873c3c Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 16 Nov 2016 11:03:10 -0800 Subject: [PATCH 129/534] [SPARK-18461][DOCS][STRUCTUREDSTREAMING] Added more information about monitoring streaming queries ## What changes were proposed in this pull request? screen shot 2016-11-15 at 6 27 32 pm screen shot 2016-11-15 at 6 27 45 pm Author: Tathagata Das Closes #15897 from tdas/SPARK-18461. (cherry picked from commit bb6cdfd9a6a6b6c91aada7c3174436146045ed1e) Signed-off-by: Michael Armbrust --- .../structured-streaming-programming-guide.md | 182 +++++++++++++++++- 1 file changed, 179 insertions(+), 3 deletions(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index d2545584ae3b0..77b66b3b3a497 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -1087,9 +1087,185 @@ spark.streams().awaitAnyTermination() # block until any one of them terminates
-Finally, for asynchronous monitoring of streaming queries, you can create and attach a `StreamingQueryListener` -([Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryListener)/[Java](api/java/org/apache/spark/sql/streaming/StreamingQueryListener.html) docs), -which will give you regular callback-based updates when queries are started and terminated. + +## Monitoring Streaming Queries +There are two ways you can monitor queries. You can directly get the current status +of an active query using `streamingQuery.status`, which will return a `StreamingQueryStatus` object +([Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryStatus)/[Java](api/java/org/apache/spark/sql/streaming/StreamingQueryStatus.html)/[Python](api/python/pyspark.sql.html#pyspark.sql.streaming.StreamingQueryStatus) docs) +that has all the details like current ingestion rates, processing rates, average latency, +details of the currently active trigger, etc. + +
+
+ +{% highlight scala %} +val query: StreamingQuery = ... + +println(query.status) + +/* Will print the current status of the query + +Status of query 'queryName' + Query id: 1 + Status timestamp: 123 + Input rate: 15.5 rows/sec + Processing rate 23.5 rows/sec + Latency: 345.0 ms + Trigger details: + batchId: 5 + isDataPresentInTrigger: true + isTriggerActive: true + latency.getBatch.total: 20 + latency.getOffset.total: 10 + numRows.input.total: 100 + Source statuses [1 source]: + Source 1 - MySource1 + Available offset: 0 + Input rate: 15.5 rows/sec + Processing rate: 23.5 rows/sec + Trigger details: + numRows.input.source: 100 + latency.getOffset.source: 10 + latency.getBatch.source: 20 + Sink status - MySink + Committed offsets: [1, -] +*/ +{% endhighlight %} + +
+
+ +{% highlight java %} +StreamingQuery query = ... + +System.out.println(query.status); + +/* Will print the current status of the query + +Status of query 'queryName' + Query id: 1 + Status timestamp: 123 + Input rate: 15.5 rows/sec + Processing rate 23.5 rows/sec + Latency: 345.0 ms + Trigger details: + batchId: 5 + isDataPresentInTrigger: true + isTriggerActive: true + latency.getBatch.total: 20 + latency.getOffset.total: 10 + numRows.input.total: 100 + Source statuses [1 source]: + Source 1 - MySource1 + Available offset: 0 + Input rate: 15.5 rows/sec + Processing rate: 23.5 rows/sec + Trigger details: + numRows.input.source: 100 + latency.getOffset.source: 10 + latency.getBatch.source: 20 + Sink status - MySink + Committed offsets: [1, -] +*/ +{% endhighlight %} + +
+
+ +{% highlight python %} +query = ... // a StreamingQuery + +print(query.status) + +''' +Will print the current status of the query + +Status of query 'queryName' + Query id: 1 + Status timestamp: 123 + Input rate: 15.5 rows/sec + Processing rate 23.5 rows/sec + Latency: 345.0 ms + Trigger details: + batchId: 5 + isDataPresentInTrigger: true + isTriggerActive: true + latency.getBatch.total: 20 + latency.getOffset.total: 10 + numRows.input.total: 100 + Source statuses [1 source]: + Source 1 - MySource1 + Available offset: 0 + Input rate: 15.5 rows/sec + Processing rate: 23.5 rows/sec + Trigger details: + numRows.input.source: 100 + latency.getOffset.source: 10 + latency.getBatch.source: 20 + Sink status - MySink + Committed offsets: [1, -] +''' +{% endhighlight %} + +
+
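
The individual metrics shown in the examples above can also be read programmatically from
`triggerDetails`, a string-to-string map keyed by the names printed in the status (for example
`batchId` and `numRows.input.total`). A minimal sketch in Python, assuming `query` is the
active `StreamingQuery` from the examples above:

{% highlight python %}
# Sketch only: pull selected metrics out of the last trigger's details map.
details = query.status.triggerDetails
print(details['batchId'], details['numRows.input.total'])
{% endhighlight %}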
+
+
+You can also asynchronously monitor all queries associated with a
+`SparkSession` by attaching a `StreamingQueryListener`
+([Scala](api/scala/index.html#org.apache.spark.sql.streaming.StreamingQueryListener)/[Java](api/java/org/apache/spark/sql/streaming/StreamingQueryListener.html) docs).
+Once you attach your custom `StreamingQueryListener` object with
+`sparkSession.streams.addListener()`, you will get callbacks when a query is started and
+stopped and when there is progress made in an active query. Here is an example.
+
+
+
+{% highlight scala %}
+val spark: SparkSession = ...
+
+spark.streams.addListener(new StreamingQueryListener() {
+
+  override def onQueryStarted(queryStarted: QueryStartedEvent): Unit = {
+    println("Query started: " + queryStarted.queryStatus.name)
+  }
+  override def onQueryTerminated(queryTerminated: QueryTerminatedEvent): Unit = {
+    println("Query terminated: " + queryTerminated.queryStatus.name)
+  }
+  override def onQueryProgress(queryProgress: QueryProgressEvent): Unit = {
+    println("Query made progress: " + queryProgress.queryStatus)
+  }
+})
+{% endhighlight %}
+
+
+
+{% highlight java %}
+SparkSession spark = ...
+
+spark.streams().addListener(new StreamingQueryListener() {
+
+  @Override
+  public void onQueryStarted(QueryStartedEvent queryStarted) {
+    System.out.println("Query started: " + queryStarted.queryStatus().name());
+  }
+  @Override
+  public void onQueryTerminated(QueryTerminatedEvent queryTerminated) {
+    System.out.println("Query terminated: " + queryTerminated.queryStatus().name());
+  }
+  @Override
+  public void onQueryProgress(QueryProgressEvent queryProgress) {
+    System.out.println("Query made progress: " + queryProgress.queryStatus());
+  }
+});
+{% endhighlight %}
+
+
+{% highlight bash %} +Not available in Python. +{% endhighlight %} + +
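
Until a listener API is exposed in Python, a rough substitute is to poll `query.status` from the
driver. A sketch only, assuming `query` is an active `StreamingQuery` returned by
`writeStream.start()`:

{% highlight python %}
import time

# Sketch only: periodically print the current status while the query is running.
while query.isActive:
    print(query.status)
    time.sleep(10)
{% endhighlight %}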
+
## Recovering from Failures with Checkpointing In case of a failure or intentional shutdown, you can recover the previous progress and state of a previous query, and continue where it left off. This is done using checkpointing and write ahead logs. You can configure a query with a checkpoint location, and the query will save all the progress information (i.e. range of offsets processed in each trigger) and the running aggregates (e.g. word counts in the [quick example](#quick-example)) to the checkpoint location. As of Spark 2.0, this checkpoint location has to be a path in an HDFS compatible file system, and can be set as an option in the DataStreamWriter when [starting a query](#starting-streaming-queries). From 523abfe19caa11747133877b0c8319c68ac66e56 Mon Sep 17 00:00:00 2001 From: Artur Sukhenko Date: Wed, 16 Nov 2016 15:08:01 -0800 Subject: [PATCH 130/534] [YARN][DOC] Increasing NodeManager's heap size with External Shuffle Service ## What changes were proposed in this pull request? Suggest users to increase `NodeManager's` heap size if `External Shuffle Service` is enabled as `NM` can spend a lot of time doing GC resulting in shuffle operations being a bottleneck due to `Shuffle Read blocked time` bumped up. Also because of GC `NodeManager` can use an enormous amount of CPU and cluster performance will suffer. I have seen NodeManager using 5-13G RAM and up to 2700% CPU with `spark_shuffle` service on. ## How was this patch tested? #### Added step 5: ![shuffle_service](https://cloud.githubusercontent.com/assets/15244468/20355499/2fec0fde-ac2a-11e6-8f8b-1c80daf71be1.png) Author: Artur Sukhenko Closes #15906 from Devian-ua/nmHeapSize. (cherry picked from commit 55589987be89ff78dadf44498352fbbd811a206e) Signed-off-by: Reynold Xin --- docs/running-on-yarn.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index cd18808681ece..fe0221ce7c5b6 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -559,6 +559,8 @@ pre-packaged distribution. 1. In the `yarn-site.xml` on each node, add `spark_shuffle` to `yarn.nodemanager.aux-services`, then set `yarn.nodemanager.aux-services.spark_shuffle.class` to `org.apache.spark.network.yarn.YarnShuffleService`. +1. Increase `NodeManager's` heap size by setting `YARN_HEAPSIZE` (1000 by default) in `etc/hadoop/yarn-env.sh` +to avoid garbage collection issues during shuffle. 1. Restart all `NodeManager`s in your cluster. The following extra configuration options are available when the shuffle service is running on YARN: From 9515793820c7954d82116238a67e632ea3e783b5 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Thu, 17 Nov 2016 11:21:08 +0800 Subject: [PATCH 131/534] [SPARK-18442][SQL] Fix nullability of WrapOption. ## What changes were proposed in this pull request? The nullability of `WrapOption` should be `false`. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15887 from ueshin/issues/SPARK-18442. 
(cherry picked from commit 170eeb345f951de89a39fe565697b3e913011768) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/expressions/objects/objects.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 50e2ac3c36d93..0e3d99127ed56 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -341,7 +341,7 @@ case class WrapOption(child: Expression, optType: DataType) override def dataType: DataType = ObjectType(classOf[Option[_]]) - override def nullable: Boolean = true + override def nullable: Boolean = false override def inputTypes: Seq[AbstractDataType] = optType :: Nil From 6a3cbbc037fe631e1b89c46000373dc2ba86a5eb Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 16 Nov 2016 14:22:15 -0800 Subject: [PATCH 132/534] [SPARK-1267][SPARK-18129] Allow PySpark to be pip installed ## What changes were proposed in this pull request? This PR aims to provide a pip installable PySpark package. This does a bunch of work to copy the jars over and package them with the Python code (to prevent challenges from trying to use different versions of the Python code with different versions of the JAR). It does not currently publish to PyPI but that is the natural follow up (SPARK-18129). Done: - pip installable on conda [manual tested] - setup.py installed on a non-pip managed system (RHEL) with YARN [manual tested] - Automated testing of this (virtualenv) - packaging and signing with release-build* Possible follow up work: - release-build update to publish to PyPI (SPARK-18128) - figure out who owns the pyspark package name on prod PyPI (is it someone with in the project or should we ask PyPI or should we choose a different name to publish with like ApachePySpark?) - Windows support and or testing ( SPARK-18136 ) - investigate details of wheel caching and see if we can avoid cleaning the wheel cache during our test - consider how we want to number our dev/snapshot versions Explicitly out of scope: - Using pip installed PySpark to start a standalone cluster - Using pip installed PySpark for non-Python Spark programs *I've done some work to test release-build locally but as a non-committer I've just done local testing. ## How was this patch tested? Automated testing with virtualenv, manual testing with conda, a system wide install, and YARN integration. release-build changes tested locally as a non-committer (no testing of upload artifacts to Apache staging websites) Author: Holden Karau Author: Juliet Hougland Author: Juliet Hougland Closes #15659 from holdenk/SPARK-1267-pip-install-pyspark. 
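
For context, the end state this patch works toward is that a pip-installed PySpark can locate its
own Spark installation and start a session without SPARK_HOME being set. A minimal sketch of the
kind of check the dev/pip-sanity-check.py script added below performs (names are illustrative):

    # Sketch only: verify a pip-installed PySpark can start a session and run a job.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("PipSanityCheck").getOrCreate()
    rdd = spark.sparkContext.parallelize(range(100), 10)
    assert rdd.reduce(lambda x, y: x + y) == 4950
    spark.stop()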
--- .gitignore | 2 + bin/beeline | 2 +- bin/find-spark-home | 41 ++++ bin/load-spark-env.sh | 2 +- bin/pyspark | 6 +- bin/run-example | 2 +- bin/spark-class | 6 +- bin/spark-shell | 4 +- bin/spark-sql | 2 +- bin/spark-submit | 2 +- bin/sparkR | 2 +- dev/create-release/release-build.sh | 26 ++- dev/create-release/release-tag.sh | 11 +- dev/lint-python | 4 +- dev/make-distribution.sh | 16 +- dev/pip-sanity-check.py | 36 +++ dev/run-pip-tests | 115 ++++++++++ dev/run-tests-jenkins.py | 1 + dev/run-tests.py | 7 + dev/sparktestsupport/__init__.py | 1 + docs/building-spark.md | 8 + docs/index.md | 4 +- .../spark/launcher/CommandBuilderUtils.java | 2 +- python/MANIFEST.in | 22 ++ python/README.md | 32 +++ python/pyspark/__init__.py | 1 + python/pyspark/find_spark_home.py | 74 +++++++ python/pyspark/java_gateway.py | 3 +- python/pyspark/version.py | 19 ++ python/setup.cfg | 22 ++ python/setup.py | 209 ++++++++++++++++++ 31 files changed, 660 insertions(+), 24 deletions(-) create mode 100755 bin/find-spark-home create mode 100644 dev/pip-sanity-check.py create mode 100755 dev/run-pip-tests create mode 100644 python/MANIFEST.in create mode 100644 python/README.md create mode 100755 python/pyspark/find_spark_home.py create mode 100644 python/pyspark/version.py create mode 100644 python/setup.cfg create mode 100644 python/setup.py diff --git a/.gitignore b/.gitignore index 39d17e1793f77..5634a434db0c0 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,8 @@ project/plugins/project/build.properties project/plugins/src_managed/ project/plugins/target/ python/lib/pyspark.zip +python/deps +python/pyspark/python reports/ scalastyle-on-compile.generated.xml scalastyle-output.xml diff --git a/bin/beeline b/bin/beeline index 1627626941a73..058534699e44b 100755 --- a/bin/beeline +++ b/bin/beeline @@ -25,7 +25,7 @@ set -o posix # Figure out if SPARK_HOME is set if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi CLASS="org.apache.hive.beeline.BeeLine" diff --git a/bin/find-spark-home b/bin/find-spark-home new file mode 100755 index 0000000000000..fa78407d4175a --- /dev/null +++ b/bin/find-spark-home @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Attempts to find a proper value for SPARK_HOME. Should be included using "source" directive. + +FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")"; pwd)/find_spark_home.py" + +# Short cirtuit if the user already has this set. +if [ ! -z "${SPARK_HOME}" ]; then + exit 0 +elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then + # If we are not in the same directory as find_spark_home.py we are not pip installed so we don't + # need to search the different Python directories for a Spark installation. 
+ # Note only that, if the user has pip installed PySpark but is directly calling pyspark-shell or + # spark-submit in another directory we want to use that version of PySpark rather than the + # pip installed version of PySpark. + export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)" +else + # We are pip installed, use the Python script to resolve a reasonable SPARK_HOME + # Default to standard python interpreter unless told otherwise + if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then + PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}" + fi + export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT") +fi diff --git a/bin/load-spark-env.sh b/bin/load-spark-env.sh index eaea964ed5b3d..8a2f709960a25 100644 --- a/bin/load-spark-env.sh +++ b/bin/load-spark-env.sh @@ -23,7 +23,7 @@ # Figure out where Spark is installed if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi if [ -z "$SPARK_ENV_LOADED" ]; then diff --git a/bin/pyspark b/bin/pyspark index d6b3ab0a44321..98387c2ec5b8a 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh @@ -46,7 +46,7 @@ WORKS_WITH_IPYTHON=$(python -c 'import sys; print(sys.version_info >= (2, 7, 0)) # Determine the Python executable to use for the executors: if [[ -z "$PYSPARK_PYTHON" ]]; then - if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! WORKS_WITH_IPYTHON ]]; then + if [[ $PYSPARK_DRIVER_PYTHON == *ipython* && ! $WORKS_WITH_IPYTHON ]]; then echo "IPython requires Python 2.7+; please install python2.7 or set PYSPARK_PYTHON" 1>&2 exit 1 else @@ -68,7 +68,7 @@ if [[ -n "$SPARK_TESTING" ]]; then unset YARN_CONF_DIR unset HADOOP_CONF_DIR export PYTHONHASHSEED=0 - exec "$PYSPARK_DRIVER_PYTHON" -m $1 + exec "$PYSPARK_DRIVER_PYTHON" -m "$1" exit fi diff --git a/bin/run-example b/bin/run-example index dd0e3c4120260..4ba5399311d33 100755 --- a/bin/run-example +++ b/bin/run-example @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/run-example [options] example-class [example args]" diff --git a/bin/spark-class b/bin/spark-class index 377c8d1add3f6..77ea40cc37946 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi . "${SPARK_HOME}"/bin/load-spark-env.sh @@ -27,7 +27,7 @@ fi if [ -n "${JAVA_HOME}" ]; then RUNNER="${JAVA_HOME}/bin/java" else - if [ `command -v java` ]; then + if [ "$(command -v java)" ]; then RUNNER="java" else echo "JAVA_HOME is not set" >&2 @@ -36,7 +36,7 @@ else fi # Find Spark jars. 
-if [ -f "${SPARK_HOME}/RELEASE" ]; then +if [ -d "${SPARK_HOME}/jars" ]; then SPARK_JARS_DIR="${SPARK_HOME}/jars" else SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" diff --git a/bin/spark-shell b/bin/spark-shell index 6583b5bd880ee..421f36cac3d47 100755 --- a/bin/spark-shell +++ b/bin/spark-shell @@ -21,7 +21,7 @@ # Shell script for starting the Spark Shell REPL cygwin=false -case "`uname`" in +case "$(uname)" in CYGWIN*) cygwin=true;; esac @@ -29,7 +29,7 @@ esac set -o posix if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]" diff --git a/bin/spark-sql b/bin/spark-sql index 970d12cbf51dd..b08b944ebd319 100755 --- a/bin/spark-sql +++ b/bin/spark-sql @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi export _SPARK_CMD_USAGE="Usage: ./bin/spark-sql [options] [cli option]" diff --git a/bin/spark-submit b/bin/spark-submit index 023f9c162f4b8..4e9d3614e6370 100755 --- a/bin/spark-submit +++ b/bin/spark-submit @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi # disable randomized hash for string in Python 3.3+ diff --git a/bin/sparkR b/bin/sparkR index 2c07a82e2173b..29ab10df8ab6d 100755 --- a/bin/sparkR +++ b/bin/sparkR @@ -18,7 +18,7 @@ # if [ -z "${SPARK_HOME}" ]; then - export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" + source "$(dirname "$0")"/find-spark-home fi source "${SPARK_HOME}"/bin/load-spark-env.sh diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 81f0d63054e29..1dbfa3b6e361b 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -162,14 +162,35 @@ if [[ "$1" == "package" ]]; then export ZINC_PORT=$ZINC_PORT echo "Creating distribution: $NAME ($FLAGS)" + # Write out the NAME and VERSION to PySpark version info we rewrite the - into a . and SNAPSHOT + # to dev0 to be closer to PEP440. We use the NAME as a "local version". + PYSPARK_VERSION=`echo "$SPARK_VERSION+$NAME" | sed -r "s/-/./" | sed -r "s/SNAPSHOT/dev0/"` + echo "__version__='$PYSPARK_VERSION'" > python/pyspark/version.py + # Get maven home set by MVN MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` - ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ + echo "Creating distribution" + ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz --pip $FLAGS \ -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log cd .. - cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . + echo "Copying and signing python distribution" + PYTHON_DIST_NAME=pyspark-$PYSPARK_VERSION.tar.gz + cp spark-$SPARK_VERSION-bin-$NAME/python/dist/$PYTHON_DIST_NAME . + + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ + --output $PYTHON_DIST_NAME.asc \ + --detach-sig $PYTHON_DIST_NAME + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + MD5 $PYTHON_DIST_NAME > \ + $PYTHON_DIST_NAME.md5 + echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ + SHA512 $PYTHON_DIST_NAME > \ + $PYTHON_DIST_NAME.sha + + echo "Copying and signing regular binary distribution" + cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . 
echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour \ --output spark-$SPARK_VERSION-bin-$NAME.tgz.asc \ --detach-sig spark-$SPARK_VERSION-bin-$NAME.tgz @@ -208,6 +229,7 @@ if [[ "$1" == "package" ]]; then # Re-upload a second time and leave the files in the timestamped upload directory: LFTP mkdir -p $dest_dir LFTP mput -O $dest_dir 'spark-*' + LFTP mput -O $dest_dir 'pyspark-*' exit 0 fi diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh index b7e5100ca7408..370a62ce15bc4 100755 --- a/dev/create-release/release-tag.sh +++ b/dev/create-release/release-tag.sh @@ -65,6 +65,7 @@ sed -i".tmp1" 's/Version.*$/Version: '"$RELEASE_VERSION"'/g' R/pkg/DESCRIPTION # Set the release version in docs sed -i".tmp1" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$RELEASE_VERSION"'/g' docs/_config.yml sed -i".tmp2" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$RELEASE_VERSION"'/g' docs/_config.yml +sed -i".tmp3" 's/__version__ = .*$/__version__ = "'"$RELEASE_VERSION"'"/' python/pyspark/version.py git commit -a -m "Preparing Spark release $RELEASE_TAG" echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH" @@ -74,12 +75,16 @@ git tag $RELEASE_TAG $MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs # Remove -SNAPSHOT before setting the R version as R expects version strings to only have numbers R_NEXT_VERSION=`echo $NEXT_VERSION | sed 's/-SNAPSHOT//g'` -sed -i".tmp2" 's/Version.*$/Version: '"$R_NEXT_VERSION"'/g' R/pkg/DESCRIPTION +sed -i".tmp4" 's/Version.*$/Version: '"$R_NEXT_VERSION"'/g' R/pkg/DESCRIPTION +# Write out the R_NEXT_VERSION to PySpark version info we use dev0 instead of SNAPSHOT to be closer +# to PEP440. +sed -i".tmp5" 's/__version__ = .*$/__version__ = "'"$R_NEXT_VERSION.dev0"'"/' python/pyspark/version.py + # Update docs with next version -sed -i".tmp3" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$NEXT_VERSION"'/g' docs/_config.yml +sed -i".tmp6" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$NEXT_VERSION"'/g' docs/_config.yml # Use R version for short version -sed -i".tmp4" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$R_NEXT_VERSION"'/g' docs/_config.yml +sed -i".tmp7" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$R_NEXT_VERSION"'/g' docs/_config.yml git commit -a -m "Preparing development version $NEXT_VERSION" diff --git a/dev/lint-python b/dev/lint-python index 63487043a50b6..3f878c2dad6b1 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -20,7 +20,9 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" PATHS_TO_CHECK="./python/pyspark/ ./examples/src/main/python/ ./dev/sparktestsupport" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" +# TODO: fix pep8 errors with the rest of the Python scripts under dev +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/*.py ./dev/run-tests-jenkins.py" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/pip-sanity-check.py" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 9be4fdfa51c93..49b46fbc3fb27 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -33,6 +33,7 @@ SPARK_HOME="$(cd "`dirname "$0"`/.."; pwd)" DISTDIR="$SPARK_HOME/dist" MAKE_TGZ=false +MAKE_PIP=false NAME=none MVN="$SPARK_HOME/build/mvn" @@ -40,7 +41,7 @@ function exit_with_usage { echo "make-distribution.sh - tool for 
making binary distributions of Spark" echo "" echo "usage:" - cl_options="[--name] [--tgz] [--mvn ]" + cl_options="[--name] [--tgz] [--pip] [--mvn ]" echo "make-distribution.sh $cl_options " echo "See Spark's \"Building Spark\" doc for correct Maven options." echo "" @@ -67,6 +68,9 @@ while (( "$#" )); do --tgz) MAKE_TGZ=true ;; + --pip) + MAKE_PIP=true + ;; --mvn) MVN="$2" shift @@ -201,6 +205,16 @@ fi # Copy data files cp -r "$SPARK_HOME/data" "$DISTDIR" +# Make pip package +if [ "$MAKE_PIP" == "true" ]; then + echo "Building python distribution package" + cd $SPARK_HOME/python + python setup.py sdist + cd .. +else + echo "Skipping creating pip installable PySpark" +fi + # Copy other things mkdir "$DISTDIR"/conf cp "$SPARK_HOME"/conf/*.template "$DISTDIR"/conf diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py new file mode 100644 index 0000000000000..430c2ab52766a --- /dev/null +++ b/dev/pip-sanity-check.py @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function + +from pyspark.sql import SparkSession +import sys + +if __name__ == "__main__": + spark = SparkSession\ + .builder\ + .appName("PipSanityCheck")\ + .getOrCreate() + sc = spark.sparkContext + rdd = sc.parallelize(range(100), 10) + value = rdd.reduce(lambda x, y: x + y) + if (value != 4950): + print("Value {0} did not match expected value.".format(value), file=sys.stderr) + sys.exit(-1) + print("Successfully ran pip sanity check") + + spark.stop() diff --git a/dev/run-pip-tests b/dev/run-pip-tests new file mode 100755 index 0000000000000..e1da18e60bb3d --- /dev/null +++ b/dev/run-pip-tests @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Stop on error +set -e +# Set nullglob for when we are checking existence based on globs +shopt -s nullglob + +FWDIR="$(cd "$(dirname "$0")"/..; pwd)" +cd "$FWDIR" + +echo "Constucting virtual env for testing" +VIRTUALENV_BASE=$(mktemp -d) + +# Clean up the virtual env enviroment used if we created one. 
+function delete_virtualenv() { + echo "Cleaning up temporary directory - $VIRTUALENV_BASE" + rm -rf "$VIRTUALENV_BASE" +} +trap delete_virtualenv EXIT + +# Some systems don't have pip or virtualenv - in those cases our tests won't work. +if ! hash virtualenv 2>/dev/null; then + echo "Missing virtualenv skipping pip installability tests." + exit 0 +fi +if ! hash pip 2>/dev/null; then + echo "Missing pip, skipping pip installability tests." + exit 0 +fi + +# Figure out which Python execs we should test pip installation with +PYTHON_EXECS=() +if hash python2 2>/dev/null; then + # We do this since we are testing with virtualenv and the default virtual env python + # is in /usr/bin/python + PYTHON_EXECS+=('python2') +elif hash python 2>/dev/null; then + # If python2 isn't installed fallback to python if available + PYTHON_EXECS+=('python') +fi +if hash python3 2>/dev/null; then + PYTHON_EXECS+=('python3') +fi + +# Determine which version of PySpark we are building for archive name +PYSPARK_VERSION=$(python -c "exec(open('python/pyspark/version.py').read());print __version__") +PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz" +# The pip install options we use for all the pip commands +PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall " +# Test both regular user and edit/dev install modes. +PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST" + "pip install $PIP_OPTIONS -e python/") + +for python in "${PYTHON_EXECS[@]}"; do + for install_command in "${PIP_COMMANDS[@]}"; do + echo "Testing pip installation with python $python" + # Create a temp directory for us to work in and save its name to a file for cleanup + echo "Using $VIRTUALENV_BASE for virtualenv" + VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python + rm -rf "$VIRTUALENV_PATH" + mkdir -p "$VIRTUALENV_PATH" + virtualenv --python=$python "$VIRTUALENV_PATH" + source "$VIRTUALENV_PATH"/bin/activate + # Upgrade pip + pip install --upgrade pip + + echo "Creating pip installable source dist" + cd "$FWDIR"/python + $python setup.py sdist + + + echo "Installing dist into virtual env" + cd dist + # Verify that the dist directory only contains one thing to install + sdists=(*.tar.gz) + if [ ${#sdists[@]} -ne 1 ]; then + echo "Unexpected number of targets found in dist directory - please cleanup existing sdists first." 
+ exit -1 + fi + # Do the actual installation + cd "$FWDIR" + $install_command + + cd / + + echo "Run basic sanity check on pip installed version with spark-submit" + spark-submit "$FWDIR"/dev/pip-sanity-check.py + echo "Run basic sanity check with import based" + python "$FWDIR"/dev/pip-sanity-check.py + echo "Run the tests for context.py" + python "$FWDIR"/python/pyspark/context.py + + cd "$FWDIR" + + done +done + +exit 0 diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index a48d918f9dc1f..1d1e72faccf2a 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -128,6 +128,7 @@ def run_tests(tests_timeout): ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', + ERROR_CODES["BLOCK_PYSPARK_PIP_TESTS"]: 'PySpark pip packaging tests', ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( tests_timeout) diff --git a/dev/run-tests.py b/dev/run-tests.py index 5d661f5f1a1c5..ab285ac96af7e 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -432,6 +432,12 @@ def run_python_tests(test_modules, parallelism): run_cmd(command) +def run_python_packaging_tests(): + set_title_and_block("Running PySpark packaging tests", "BLOCK_PYSPARK_PIP_TESTS") + command = [os.path.join(SPARK_HOME, "dev", "run-pip-tests")] + run_cmd(command) + + def run_build_tests(): set_title_and_block("Running build tests", "BLOCK_BUILD_TESTS") run_cmd([os.path.join(SPARK_HOME, "dev", "test-dependencies.sh")]) @@ -583,6 +589,7 @@ def main(): modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: run_python_tests(modules_with_python_tests, opts.parallelism) + run_python_packaging_tests() if any(m.should_run_r_tests for m in test_modules): run_sparkr_tests() diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index 89015f8c4fb9c..38f25da41f775 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -33,5 +33,6 @@ "BLOCK_SPARKR_UNIT_TESTS": 20, "BLOCK_JAVA_STYLE": 21, "BLOCK_BUILD_TESTS": 22, + "BLOCK_PYSPARK_PIP_TESTS": 23, "BLOCK_TIMEOUT": 124 } diff --git a/docs/building-spark.md b/docs/building-spark.md index 2b404bd3e116c..88da0cc9c3bbf 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -265,6 +265,14 @@ or Java 8 tests are automatically enabled when a Java 8 JDK is detected. If you have JDK 8 installed but it is not the system default, you can set JAVA_HOME to point to JDK 8 before running the tests. +## PySpark pip installable + +If you are building Spark for use in a Python environment and you wish to pip install it, you will first need to build the Spark JARs as described above. Then you can construct an sdist package suitable for setup.py and pip installable package. + + cd python; python setup.py sdist + +**Note:** Due to packaging requirements you can not directly pip install from the Python directory, rather you must first build the sdist package as described above. + ## PySpark Tests with Maven If you are building PySpark and wish to run the PySpark tests you will need to build Spark with Hive support. 
diff --git a/docs/index.md b/docs/index.md index fe51439ae08d7..39de11de854a7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,7 +14,9 @@ It also supports a rich set of higher-level tools including [Spark SQL](sql-prog Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark uses Hadoop's client libraries for HDFS and YARN. Downloads are pre-packaged for a handful of popular Hadoop versions. Users can also download a "Hadoop free" binary and run Spark with any Hadoop version -[by augmenting Spark's classpath](hadoop-provided.html). +[by augmenting Spark's classpath](hadoop-provided.html). +Scala and Java users can include Spark in their projects using its maven cooridnates and in the future Python users can also install Spark from PyPI. + If you'd like to build Spark from source, visit [Building Spark](building-spark.html). diff --git a/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java b/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java index 62a22008d0d5d..250b2a882feb5 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java +++ b/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java @@ -357,7 +357,7 @@ static int javaMajorVersion(String javaVersion) { static String findJarsDir(String sparkHome, String scalaVersion, boolean failIfNotFound) { // TODO: change to the correct directory once the assembly build is changed. File libdir; - if (new File(sparkHome, "RELEASE").isFile()) { + if (new File(sparkHome, "jars").isDirectory()) { libdir = new File(sparkHome, "jars"); checkState(!failIfNotFound || libdir.isDirectory(), "Library directory '%s' does not exist.", diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 0000000000000..bbcce1baa439d --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +global-exclude *.py[cod] __pycache__ .DS_Store +recursive-include deps/jars *.jar +graft deps/bin +recursive-include deps/examples *.py +recursive-include lib *.zip +include README.md diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000000000..0a5c8010b8486 --- /dev/null +++ b/python/README.md @@ -0,0 +1,32 @@ +# Apache Spark + +Spark is a fast and general cluster computing system for Big Data. It provides +high-level APIs in Scala, Java, Python, and R, and an optimized engine that +supports general computation graphs for data analysis. It also supports a +rich set of higher-level tools including Spark SQL for SQL and DataFrames, +MLlib for machine learning, GraphX for graph processing, +and Spark Streaming for stream processing. 
+ + + +## Online Documentation + +You can find the latest Spark documentation, including a programming +guide, on the [project web page](http://spark.apache.org/documentation.html) + + +## Python Packaging + +This README file only contains basic information related to pip installed PySpark. +This packaging is currently experimental and may change in future versions (although we will do our best to keep compatibility). +Using PySpark requires the Spark JARs, and if you are building this from source please see the builder instructions at +["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). + +The Python packaging for Spark is not intended to replace all of the other use cases. This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to setup your own standalone Spark cluster. You can download the full version of Spark from the [Apache Spark downloads page](http://spark.apache.org/downloads.html). + + +**NOTE:** If you are using this with a Spark standalone cluster you must ensure that the version (including minor version) matches or you may experience odd errors. + +## Python Requirements + +At its core PySpark depends on Py4J (currently version 0.10.4), but additional sub-packages have their own requirements (including numpy and pandas). \ No newline at end of file diff --git a/python/pyspark/__init__.py b/python/pyspark/__init__.py index ec1687415a7f6..5f93586a48a5a 100644 --- a/python/pyspark/__init__.py +++ b/python/pyspark/__init__.py @@ -50,6 +50,7 @@ from pyspark.serializers import MarshalSerializer, PickleSerializer from pyspark.status import * from pyspark.profiler import Profiler, BasicProfiler +from pyspark.version import __version__ def since(version): diff --git a/python/pyspark/find_spark_home.py b/python/pyspark/find_spark_home.py new file mode 100755 index 0000000000000..212a618b767ab --- /dev/null +++ b/python/pyspark/find_spark_home.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script attempt to determine the correct setting for SPARK_HOME given +# that Spark may have been installed on the system with pip. + +from __future__ import print_function +import os +import sys + + +def _find_spark_home(): + """Find the SPARK_HOME.""" + # If the enviroment has SPARK_HOME set trust it. 
+ if "SPARK_HOME" in os.environ: + return os.environ["SPARK_HOME"] + + def is_spark_home(path): + """Takes a path and returns true if the provided path could be a reasonable SPARK_HOME""" + return (os.path.isfile(os.path.join(path, "bin/spark-submit")) and + (os.path.isdir(os.path.join(path, "jars")) or + os.path.isdir(os.path.join(path, "assembly")))) + + paths = ["../", os.path.dirname(os.path.realpath(__file__))] + + # Add the path of the PySpark module if it exists + if sys.version < "3": + import imp + try: + module_home = imp.find_module("pyspark")[1] + paths.append(module_home) + # If we are installed in edit mode also look two dirs up + paths.append(os.path.join(module_home, "../../")) + except ImportError: + # Not pip installed no worries + pass + else: + from importlib.util import find_spec + try: + module_home = os.path.dirname(find_spec("pyspark").origin) + paths.append(module_home) + # If we are installed in edit mode also look two dirs up + paths.append(os.path.join(module_home, "../../")) + except ImportError: + # Not pip installed no worries + pass + + # Normalize the paths + paths = [os.path.abspath(p) for p in paths] + + try: + return next(path for path in paths if is_spark_home(path)) + except StopIteration: + print("Could not find valid SPARK_HOME while searching {0}".format(paths), file=sys.stderr) + exit(-1) + +if __name__ == "__main__": + print(_find_spark_home()) diff --git a/python/pyspark/java_gateway.py b/python/pyspark/java_gateway.py index c1cf843d84388..3c783ae541a1f 100644 --- a/python/pyspark/java_gateway.py +++ b/python/pyspark/java_gateway.py @@ -29,6 +29,7 @@ xrange = range from py4j.java_gateway import java_import, JavaGateway, GatewayClient +from pyspark.find_spark_home import _find_spark_home from pyspark.serializers import read_int @@ -41,7 +42,7 @@ def launch_gateway(conf=None): if "PYSPARK_GATEWAY_PORT" in os.environ: gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"]) else: - SPARK_HOME = os.environ["SPARK_HOME"] + SPARK_HOME = _find_spark_home() # Launch the Py4j gateway using Spark's run command so that we pick up the # proper classpath and settings from spark-env.sh on_windows = platform.system() == "Windows" diff --git a/python/pyspark/version.py b/python/pyspark/version.py new file mode 100644 index 0000000000000..08a301695fda7 --- /dev/null +++ b/python/pyspark/version.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "2.1.0.dev0" diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 0000000000000..d100b932bbafc --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,22 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +[bdist_wheel] +universal = 1 + +[metadata] +description-file = README.md diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000000000..625aea04073f5 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import glob +import os +import sys +from setuptools import setup, find_packages +from shutil import copyfile, copytree, rmtree + +if sys.version_info < (2, 7): + print("Python versions prior to 2.7 are not supported for pip installed PySpark.", + file=sys.stderr) + exit(-1) + +try: + exec(open('pyspark/version.py').read()) +except IOError: + print("Failed to load PySpark version file for packaging. You must be in Spark's python dir.", + file=sys.stderr) + sys.exit(-1) +VERSION = __version__ +# A temporary path so we can access above the Python project root and fetch scripts and jars we need +TEMP_PATH = "deps" +SPARK_HOME = os.path.abspath("../") + +# Provide guidance about how to use setup.py +incorrect_invocation_message = """ +If you are installing pyspark from spark source, you must first build Spark and +run sdist. + + To build Spark with maven you can run: + ./build/mvn -DskipTests clean package + Building the source dist is done in the Python directory: + cd python + python setup.py sdist + pip install dist/*.tar.gz""" + +# Figure out where the jars are we need to package with PySpark. 
+JARS_PATH = glob.glob(os.path.join(SPARK_HOME, "assembly/target/scala-*/jars/")) + +if len(JARS_PATH) == 1: + JARS_PATH = JARS_PATH[0] +elif (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1): + # Release mode puts the jars in a jars directory + JARS_PATH = os.path.join(SPARK_HOME, "jars") +elif len(JARS_PATH) > 1: + print("Assembly jars exist for multiple scalas ({0}), please cleanup assembly/target".format( + JARS_PATH), file=sys.stderr) + sys.exit(-1) +elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH): + print(incorrect_invocation_message, file=sys.stderr) + sys.exit(-1) + +EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python") +SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin") +SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin") +JARS_TARGET = os.path.join(TEMP_PATH, "jars") +EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples") + + +# Check and see if we are under the spark path in which case we need to build the symlink farm. +# This is important because we only want to build the symlink farm while under Spark otherwise we +# want to use the symlink farm. And if the symlink farm exists under while under Spark (e.g. a +# partially built sdist) we should error and have the user sort it out. +in_spark = (os.path.isfile("../core/src/main/scala/org/apache/spark/SparkContext.scala") or + (os.path.isfile("../RELEASE") and len(glob.glob("../jars/spark*core*.jar")) == 1)) + + +def _supports_symlinks(): + """Check if the system supports symlinks (e.g. *nix) or not.""" + return getattr(os, "symlink", None) is not None + + +if (in_spark): + # Construct links for setup + try: + os.mkdir(TEMP_PATH) + except: + print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH), + file=sys.stderr) + exit(-1) + +try: + # We copy the shell script to be under pyspark/python/pyspark so that the launcher scripts + # find it where expected. The rest of the files aren't copied because they are accessed + # using Python imports instead which will be resolved correctly. + try: + os.makedirs("pyspark/python/pyspark") + except OSError: + # Don't worry if the directory already exists. + pass + copyfile("pyspark/shell.py", "pyspark/python/pyspark/shell.py") + + if (in_spark): + # Construct the symlink farm - this is necessary since we can't refer to the path above the + # package root and we need to copy the jars and scripts which are up above the python root. + if _supports_symlinks(): + os.symlink(JARS_PATH, JARS_TARGET) + os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET) + os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET) + else: + # For windows fall back to the slower copytree + copytree(JARS_PATH, JARS_TARGET) + copytree(SCRIPTS_PATH, SCRIPTS_TARGET) + copytree(EXAMPLES_PATH, EXAMPLES_TARGET) + else: + # If we are not inside of SPARK_HOME verify we have the required symlink farm + if not os.path.exists(JARS_TARGET): + print("To build packaging must be in the python directory under the SPARK_HOME.", + file=sys.stderr) + + if not os.path.isdir(SCRIPTS_TARGET): + print(incorrect_invocation_message, file=sys.stderr) + exit(-1) + + # Scripts directive requires a list of each script path and does not take wild cards. + script_names = os.listdir(SCRIPTS_TARGET) + scripts = list(map(lambda script: os.path.join(SCRIPTS_TARGET, script), script_names)) + # We add find_spark_home.py to the bin directory we install so that pip installed PySpark + # will search for SPARK_HOME with Python. 
+ scripts.append("pyspark/find_spark_home.py") + + # Parse the README markdown file into rst for PyPI + long_description = "!!!!! missing pandoc do not upload to PyPI !!!!" + try: + import pypandoc + long_description = pypandoc.convert('README.md', 'rst') + except ImportError: + print("Could not import pypandoc - required to package PySpark", file=sys.stderr) + + setup( + name='pyspark', + version=VERSION, + description='Apache Spark Python API', + long_description=long_description, + author='Spark Developers', + author_email='dev@spark.apache.org', + url='https://github.com/apache/spark/tree/master/python', + packages=['pyspark', + 'pyspark.mllib', + 'pyspark.ml', + 'pyspark.sql', + 'pyspark.streaming', + 'pyspark.bin', + 'pyspark.jars', + 'pyspark.python.pyspark', + 'pyspark.python.lib', + 'pyspark.examples.src.main.python'], + include_package_data=True, + package_dir={ + 'pyspark.jars': 'deps/jars', + 'pyspark.bin': 'deps/bin', + 'pyspark.python.lib': 'lib', + 'pyspark.examples.src.main.python': 'deps/examples', + }, + package_data={ + 'pyspark.jars': ['*.jar'], + 'pyspark.bin': ['*'], + 'pyspark.python.lib': ['*.zip'], + 'pyspark.examples.src.main.python': ['*.py', '*/*.py']}, + scripts=scripts, + license='http://www.apache.org/licenses/LICENSE-2.0', + install_requires=['py4j==0.10.4'], + setup_requires=['pypandoc'], + extras_require={ + 'ml': ['numpy>=1.7'], + 'mllib': ['numpy>=1.7'], + 'sql': ['pandas'] + }, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy'] + ) +finally: + # We only cleanup the symlink farm if we were in Spark, otherwise we are installing rather than + # packaging. + if (in_spark): + # Depending on cleaning up the symlink farm or copied version + if _supports_symlinks(): + os.remove(os.path.join(TEMP_PATH, "jars")) + os.remove(os.path.join(TEMP_PATH, "bin")) + os.remove(os.path.join(TEMP_PATH, "examples")) + else: + rmtree(os.path.join(TEMP_PATH, "jars")) + rmtree(os.path.join(TEMP_PATH, "bin")) + rmtree(os.path.join(TEMP_PATH, "examples")) + os.rmdir(TEMP_PATH) From 014fceee04c69d7944c74b3794e821e4d1003dd0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 17 Nov 2016 00:00:38 -0800 Subject: [PATCH 133/534] [SPARK-18464][SQL] support old table which doesn't store schema in metastore ## What changes were proposed in this pull request? Before Spark 2.1, users can create an external data source table without schema, and we will infer the table schema at runtime. In Spark 2.1, we decided to infer the schema when the table was created, so that we don't need to infer it again and again at runtime. This is a good improvement, but we should still respect and support old tables which doesn't store table schema in metastore. ## How was this patch tested? regression test. Author: Wenchen Fan Closes #15900 from cloud-fan/hive-catalog. 
(cherry picked from commit 07b3f045cd6f79b92bc86b3b1b51d3d5e6bd37ce) Signed-off-by: Reynold Xin --- .../spark/sql/execution/command/tables.scala | 8 ++++++- .../spark/sql/hive/HiveExternalCatalog.scala | 5 +++++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +++- .../sql/hive/MetastoreDataSourcesSuite.scala | 22 +++++++++++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 119e732d0202c..7049e53a78684 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -431,7 +431,13 @@ case class DescribeTableCommand( describeSchema(catalog.lookupRelation(table).schema, result) } else { val metadata = catalog.getTableMetadata(table) - describeSchema(metadata.schema, result) + if (metadata.schema.isEmpty) { + // In older version(prior to 2.1) of Spark, the table schema can be empty and should be + // inferred at runtime. We should still support it. + describeSchema(catalog.lookupRelation(metadata.identifier).schema, result) + } else { + describeSchema(metadata.schema, result) + } describePartitionInfo(metadata, result) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index cbd00da81cfcd..843305883abc8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -1023,6 +1023,11 @@ object HiveExternalCatalog { // After SPARK-6024, we removed this flag. // Although we are not using `spark.sql.sources.schema` any more, we need to still support. DataType.fromJson(schema.get).asInstanceOf[StructType] + } else if (props.filterKeys(_.startsWith(DATASOURCE_SCHEMA_PREFIX)).isEmpty) { + // If there is no schema information in table properties, it means the schema of this table + // was empty when saving into metastore, which is possible in older version(prior to 2.1) of + // Spark. We should respect it. + new StructType() } else { val numSchemaParts = props.get(DATASOURCE_SCHEMA_NUMPARTS) if (numSchemaParts.isDefined) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 8e5fc88aad448..edbde5d10b47c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -64,7 +64,9 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val dataSource = DataSource( sparkSession, - userSpecifiedSchema = Some(table.schema), + // In older version(prior to 2.1) of Spark, the table schema can be empty and should be + // inferred at runtime. We should still support it. 
+ userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema), partitionColumns = table.partitionColumnNames, bucketSpec = table.bucketSpec, className = table.provider.get, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index c50f92e783c88..4ab1a54edc46d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -1371,4 +1371,26 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } } + + test("SPARK-18464: support old table which doesn't store schema in table properties") { + withTable("old") { + withTempPath { path => + Seq(1 -> "a").toDF("i", "j").write.parquet(path.getAbsolutePath) + val tableDesc = CatalogTable( + identifier = TableIdentifier("old", Some("default")), + tableType = CatalogTableType.EXTERNAL, + storage = CatalogStorageFormat.empty.copy( + properties = Map("path" -> path.getAbsolutePath) + ), + schema = new StructType(), + properties = Map( + HiveExternalCatalog.DATASOURCE_PROVIDER -> "parquet")) + hiveClient.createTable(tableDesc, ignoreIfExists = false) + + checkAnswer(spark.table("old"), Row(1, "a")) + + checkAnswer(sql("DESC old"), Row("i", "int", null) :: Row("j", "string", null) :: Nil) + } + } + } } From 2ee4fc8891be53b2fae43faa5cd09ade32173bba Mon Sep 17 00:00:00 2001 From: Weiqing Yang Date: Thu, 17 Nov 2016 11:13:22 +0000 Subject: [PATCH 134/534] [YARN][DOC] Remove non-Yarn specific configurations from running-on-yarn.md ## What changes were proposed in this pull request? Remove `spark.driver.memory`, `spark.executor.memory`, `spark.driver.cores`, and `spark.executor.cores` from `running-on-yarn.md` as they are not Yarn-specific, and they are also defined in`configuration.md`. ## How was this patch tested? Build passed & Manually check. Author: Weiqing Yang Closes #15869 from weiqingy/yarnDoc. (cherry picked from commit a3cac7bd86a6fe8e9b42da1bf580aaeb59378304) Signed-off-by: Sean Owen --- docs/running-on-yarn.md | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index fe0221ce7c5b6..4d1fafc07b8fc 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -117,28 +117,6 @@ To use a custom metrics.properties for the application master and executors, upd Use lower-case suffixes, e.g. k, m, g, t, and p, for kibi-, mebi-, gibi-, tebi-, and pebibytes, respectively. - - spark.driver.memory - 1g - - Amount of memory to use for the driver process, i.e. where SparkContext is initialized. - (e.g. 1g, 2g). - -
Note: In client mode, this config must not be set through the SparkConf - directly in your application, because the driver JVM has already started at that point. - Instead, please set this through the --driver-memory command line option - or in your default properties file. - - - - spark.driver.cores - 1 - - Number of cores used by the driver in YARN cluster mode. - Since the driver is run in the same JVM as the YARN Application Master in cluster mode, this also controls the cores used by the YARN Application Master. - In client mode, use spark.yarn.am.cores to control the number of cores used by the YARN Application Master instead. - - spark.yarn.am.cores 1 @@ -233,13 +211,6 @@ To use a custom metrics.properties for the application master and executors, upd Comma-separated list of jars to be placed in the working directory of each executor. - - spark.executor.cores - 1 in YARN mode, all the available cores on the worker in standalone mode. - - The number of cores to use on each executor. For YARN and standalone mode only. - - spark.executor.instances 2 @@ -247,13 +218,6 @@ To use a custom metrics.properties for the application master and executors, upd The number of executors for static allocation. With spark.dynamicAllocation.enabled, the initial set of executors will be at least this large. - - spark.executor.memory - 1g - - Amount of memory to use per executor process (e.g. 2g, 8g). - - spark.yarn.executor.memoryOverhead executorMemory * 0.10, with minimum of 384 From 4fcecb4cf081fba0345f1939420ca1d9f6de720c Mon Sep 17 00:00:00 2001 From: anabranch Date: Thu, 17 Nov 2016 11:34:55 +0000 Subject: [PATCH 135/534] [SPARK-18365][DOCS] Improve Sample Method Documentation ## What changes were proposed in this pull request? I found the documentation for the sample method to be confusing, this adds more clarification across all languages. - [x] Scala - [x] Python - [x] R - [x] RDD Scala - [ ] RDD Python with SEED - [X] RDD Java - [x] RDD Java with SEED - [x] RDD Python ## How was this patch tested? NA Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: anabranch Author: Bill Chambers Closes #15815 from anabranch/SPARK-18365. (cherry picked from commit 49b6f456aca350e9e2c170782aa5cc75e7822680) Signed-off-by: Sean Owen --- R/pkg/R/DataFrame.R | 4 +++- .../main/scala/org/apache/spark/api/java/JavaRDD.scala | 8 ++++++-- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 3 +++ python/pyspark/rdd.py | 5 +++++ python/pyspark/sql/dataframe.py | 5 +++++ .../src/main/scala/org/apache/spark/sql/Dataset.scala | 10 ++++++++-- 6 files changed, 30 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 1cf9b38ea6483..4e3d97bb3ad07 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -936,7 +936,9 @@ setMethod("unique", #' Sample #' -#' Return a sampled subset of this SparkDataFrame using a random seed. +#' Return a sampled subset of this SparkDataFrame using a random seed. +#' Note: this is not guaranteed to provide exactly the fraction specified +#' of the total count of of the given SparkDataFrame. 
#' #' @param x A SparkDataFrame #' @param withReplacement Sampling with replacement or not diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index 20d6c9341bf7a..d67cff64e6e46 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -98,7 +98,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) def repartition(numPartitions: Int): JavaRDD[T] = rdd.repartition(numPartitions) /** - * Return a sampled subset of this RDD. + * Return a sampled subset of this RDD with a random seed. + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size @@ -109,7 +111,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) sample(withReplacement, fraction, Utils.random.nextLong) /** - * Return a sampled subset of this RDD. + * Return a sampled subset of this RDD, with a user-supplied seed. + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index e018af35cb18d..cded899db1f5c 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -466,6 +466,9 @@ abstract class RDD[T: ClassTag]( /** * Return a sampled subset of this RDD. * + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. + * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 2de2c2fd1a60b..a163ceafe9d3b 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -386,6 +386,11 @@ def sample(self, withReplacement, fraction, seed=None): with replacement: expected number of times each element is chosen; fraction must be >= 0 :param seed: seed for the random number generator + .. note:: + + This is not guaranteed to provide exactly the fraction specified of the total count + of the given :class:`DataFrame`. + >>> rdd = sc.parallelize(range(100), 4) >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14 True diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 29710acf54c4f..38998900837cf 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -549,6 +549,11 @@ def distinct(self): def sample(self, withReplacement, fraction, seed=None): """Returns a sampled subset of this :class:`DataFrame`. + .. note:: + + This is not guaranteed to provide exactly the fraction specified of the total count + of the given :class:`DataFrame`. 
+ >>> df.sample(False, 0.5, 42).count() 2 """ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index af30683cc01c4..3761773698df3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1646,7 +1646,10 @@ class Dataset[T] private[sql]( } /** - * Returns a new Dataset by sampling a fraction of rows. + * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed. + * + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. @@ -1665,7 +1668,10 @@ class Dataset[T] private[sql]( } /** - * Returns a new Dataset by sampling a fraction of rows, using a random seed. + * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed. + * + * Note: this is NOT guaranteed to provide exactly the fraction of the total count + * of the given [[Dataset]]. * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. From 42777b1b3c10d3945494e27f1dedd43f2f836361 Mon Sep 17 00:00:00 2001 From: VinceShieh Date: Thu, 17 Nov 2016 13:37:42 +0000 Subject: [PATCH 136/534] [SPARK-17462][MLLIB]use VersionUtils to parse Spark version strings ## What changes were proposed in this pull request? Several places in MLlib use custom regexes or other approaches to parse Spark versions. Those should be fixed to use the VersionUtils. This PR replaces custom regexes with VersionUtils to get Spark version numbers. ## How was this patch tested? Existing tests. Signed-off-by: VinceShieh vincent.xieintel.com Author: VinceShieh Closes #15055 from VinceShieh/SPARK-17462. 
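For context, the sketch below contrasts the hand-rolled parsing being removed with the shared utility being adopted; the literal version string is just an example value:

    // What each ML loader previously did by hand (plain Scala, runnable on its own):
    val versionRegex = "([0-9]+)\\.(.+)".r
    val versionRegex(major, _) = "2.0.2"          // e.g. the value of metadata.sparkVersion
    val savedBySpark2OrLater = major.toInt >= 2   // true for "2.0.2"

    // What the patch switches to (Spark-internal utility, quoted from the diff below):
    //   import org.apache.spark.util.VersionUtils.majorVersion
    //   majorVersion(metadata.sparkVersion) >= 2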
(cherry picked from commit de77c67750dc868d75d6af173c3820b75a9fe4b7)
Signed-off-by: Sean Owen
---
 .../main/scala/org/apache/spark/ml/clustering/KMeans.scala  | 6 ++----
 mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala  | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index a0d481b294ac7..26505b4cc1501 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -33,6 +33,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.util.VersionUtils.majorVersion

 /**
  * Common params for KMeans and KMeansModel
@@ -232,10 +233,7 @@ object KMeansModel extends MLReadable[KMeansModel] {
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
       val dataPath = new Path(path, "data").toString

-      val versionRegex = "([0-9]+)\\.(.+)".r
-      val versionRegex(major, _) = metadata.sparkVersion
-
-      val clusterCenters = if (major.toInt >= 2) {
+      val clusterCenters = if (majorVersion(metadata.sparkVersion) >= 2) {
         val data: Dataset[Data] = sparkSession.read.parquet(dataPath).as[Data]
         data.collect().sortBy(_.clusterIdx).map(_.clusterCenter).map(OldVectors.fromML)
       } else {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 444006fe1edb6..1e49352b8517e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.util.VersionUtils.majorVersion

 /**
  * Params for [[PCA]] and [[PCAModel]].
@@ -204,11 +205,8 @@ object PCAModel extends MLReadable[PCAModel] {

     override def load(path: String): PCAModel = {
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
-      val versionRegex = "([0-9]+)\\.(.+)".r
-      val versionRegex(major, _) = metadata.sparkVersion
-
       val dataPath = new Path(path, "data").toString
-      val model = if (major.toInt >= 2) {
+      val model = if (majorVersion(metadata.sparkVersion) >= 2) {
         val Row(pc: DenseMatrix, explainedVariance: DenseVector) =
           sparkSession.read.parquet(dataPath)
             .select("pc", "explainedVariance")

From 536a2159393c82d414cc46797c8bfd958f453d33 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 17 Nov 2016 13:40:16 +0000
Subject: [PATCH 137/534] [SPARK-18480][DOCS] Fix wrong links for ML guide docs

## What changes were proposed in this pull request?
1. There are two `[Graph.partitionBy]` entries in `graphx-programming-guide.md`; the first one had no effect.
2. `DataFrame`, `Transformer`, `Pipeline` and `Parameter` in `ml-pipeline.md` were linked to `ml-guide.html` by mistake.
3. `PythonMLLibAPI` in `mllib-linear-methods.md` was not accessible, because the `PythonMLLibAPI` class is private.
4. Other link updates.

## How was this patch tested?
Manual tests.

Author: Zheng RuiFeng

Closes #15912 from zhengruifeng/md_fix.
(cherry picked from commit cdaf4ce9fe58c4606be8aa2a5c3756d30545c850) Signed-off-by: Sean Owen --- docs/graphx-programming-guide.md | 1 - docs/ml-classification-regression.md | 4 ++-- docs/ml-features.md | 2 +- docs/ml-pipeline.md | 12 ++++++------ docs/mllib-linear-methods.md | 4 +--- .../main/scala/org/apache/spark/ml/feature/LSH.scala | 2 +- .../spark/ml/tree/impl/GradientBoostedTrees.scala | 8 ++++---- .../org/apache/spark/ml/tree/impl/RandomForest.scala | 8 ++++---- 8 files changed, 19 insertions(+), 22 deletions(-) diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 1097cf1211c1f..e271b28fb4f28 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -36,7 +36,6 @@ description: GraphX graph processing library guide for Spark SPARK_VERSION_SHORT [Graph.fromEdgeTuples]: api/scala/index.html#org.apache.spark.graphx.Graph$@fromEdgeTuples[VD](RDD[(VertexId,VertexId)],VD,Option[PartitionStrategy])(ClassTag[VD]):Graph[VD,Int] [Graph.fromEdges]: api/scala/index.html#org.apache.spark.graphx.Graph$@fromEdges[VD,ED](RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] [PartitionStrategy]: api/scala/index.html#org.apache.spark.graphx.PartitionStrategy -[Graph.partitionBy]: api/scala/index.html#org.apache.spark.graphx.Graph$@partitionBy(partitionStrategy:org.apache.spark.graphx.PartitionStrategy):org.apache.spark.graphx.Graph[VD,ED] [PageRank]: api/scala/index.html#org.apache.spark.graphx.lib.PageRank$ [ConnectedComponents]: api/scala/index.html#org.apache.spark.graphx.lib.ConnectedComponents$ [TriangleCount]: api/scala/index.html#org.apache.spark.graphx.lib.TriangleCount$ diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index cb2ccbf4fe157..c72c01fcff830 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -984,7 +984,7 @@ Random forests combine many decision trees in order to reduce the risk of overfi The `spark.ml` implementation supports random forests for binary and multiclass classification and for regression, using both continuous and categorical features. -For more information on the algorithm itself, please see the [`spark.mllib` documentation on random forests](mllib-ensembles.html). +For more information on the algorithm itself, please see the [`spark.mllib` documentation on random forests](mllib-ensembles.html#random-forests). ### Inputs and Outputs @@ -1065,7 +1065,7 @@ GBTs iteratively train decision trees in order to minimize a loss function. The `spark.ml` implementation supports GBTs for binary classification and for regression, using both continuous and categorical features. -For more information on the algorithm itself, please see the [`spark.mllib` documentation on GBTs](mllib-ensembles.html). +For more information on the algorithm itself, please see the [`spark.mllib` documentation on GBTs](mllib-ensembles.html#gradient-boosted-trees-gbts). ### Inputs and Outputs diff --git a/docs/ml-features.md b/docs/ml-features.md index 903177210d820..45724a3716e74 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -694,7 +694,7 @@ for more details on the API. `VectorIndexer` helps index categorical features in datasets of `Vector`s. It can both automatically decide which features are categorical and convert original values to category indices. Specifically, it does the following: -1. Take an input column of type [Vector](api/scala/index.html#org.apache.spark.mllib.linalg.Vector) and a parameter `maxCategories`. +1. 
Take an input column of type [Vector](api/scala/index.html#org.apache.spark.ml.linalg.Vector) and a parameter `maxCategories`. 2. Decide which features should be categorical based on the number of distinct values, where features with at most `maxCategories` are declared categorical. 3. Compute 0-based category indices for each categorical feature. 4. Index categorical features and transform original feature values to indices. diff --git a/docs/ml-pipeline.md b/docs/ml-pipeline.md index b4d6be94f5eb0..0384513ab7014 100644 --- a/docs/ml-pipeline.md +++ b/docs/ml-pipeline.md @@ -38,26 +38,26 @@ algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Pipelines API, where the pipeline concept is mostly inspired by the [scikit-learn](http://scikit-learn.org/) project. -* **[`DataFrame`](ml-guide.html#dataframe)**: This ML API uses `DataFrame` from Spark SQL as an ML +* **[`DataFrame`](ml-pipeline.html#dataframe)**: This ML API uses `DataFrame` from Spark SQL as an ML dataset, which can hold a variety of data types. E.g., a `DataFrame` could have different columns storing text, feature vectors, true labels, and predictions. -* **[`Transformer`](ml-guide.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. +* **[`Transformer`](ml-pipeline.html#transformers)**: A `Transformer` is an algorithm which can transform one `DataFrame` into another `DataFrame`. E.g., an ML model is a `Transformer` which transforms a `DataFrame` with features into a `DataFrame` with predictions. -* **[`Estimator`](ml-guide.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. +* **[`Estimator`](ml-pipeline.html#estimators)**: An `Estimator` is an algorithm which can be fit on a `DataFrame` to produce a `Transformer`. E.g., a learning algorithm is an `Estimator` which trains on a `DataFrame` and produces a model. -* **[`Pipeline`](ml-guide.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. +* **[`Pipeline`](ml-pipeline.html#pipeline)**: A `Pipeline` chains multiple `Transformer`s and `Estimator`s together to specify an ML workflow. -* **[`Parameter`](ml-guide.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. +* **[`Parameter`](ml-pipeline.html#parameters)**: All `Transformer`s and `Estimator`s now share a common API for specifying parameters. ## DataFrame Machine learning can be applied to a wide variety of data types, such as vectors, text, images, and structured data. This API adopts the `DataFrame` from Spark SQL in order to support a variety of data types. -`DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#spark-sql-datatype-reference) for a list of supported types. +`DataFrame` supports many basic and structured types; see the [Spark SQL datatype reference](sql-programming-guide.html#data-types) for a list of supported types. In addition to the types listed in the Spark SQL guide, `DataFrame` can use ML [`Vector`](mllib-data-types.html#local-vector) types. A `DataFrame` can be created either implicitly or explicitly from a regular `RDD`. See the code examples below and the [Spark SQL programming guide](sql-programming-guide.html) for examples. 
diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index 816bdf1317000..3085539b40e61 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -139,7 +139,7 @@ and logistic regression. Linear SVMs supports only binary classification, while logistic regression supports both binary and multiclass classification problems. For both methods, `spark.mllib` supports L1 and L2 regularized variants. -The training data set is represented by an RDD of [LabeledPoint](mllib-data-types.html) in MLlib, +The training data set is represented by an RDD of [LabeledPoint](mllib-data-types.html#labeled-point) in MLlib, where labels are class indices starting from zero: $0, 1, 2, \ldots$. ### Linear Support Vector Machines (SVMs) @@ -491,5 +491,3 @@ Algorithms are all implemented in Scala: * [RidgeRegressionWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.RidgeRegressionWithSGD) * [LassoWithSGD](api/scala/index.html#org.apache.spark.mllib.regression.LassoWithSGD) -Python calls the Scala implementation via -[PythonMLLibAPI](api/scala/index.html#org.apache.spark.mllib.api.python.PythonMLLibAPI). diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala index 333a8c364a884..eb117c40eea3a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala @@ -40,7 +40,7 @@ private[ml] trait LSHParams extends HasInputCol with HasOutputCol { * @group param */ final val outputDim: IntParam = new IntParam(this, "outputDim", "output dimension, where" + - "increasing dimensionality lowers the false negative rate, and decreasing dimensionality" + + " increasing dimensionality lowers the false negative rate, and decreasing dimensionality" + " improves the running performance", ParamValidators.gt(0)) /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala index 7bef899a633d9..ede0a060eef95 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala @@ -34,7 +34,7 @@ private[spark] object GradientBoostedTrees extends Logging { /** * Method to train a gradient boosting model - * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + * @param input Training dataset: RDD of [[LabeledPoint]]. * @param seed Random seed. * @return tuple of ensemble models and weights: * (array of decision tree models, array of model weights) @@ -59,7 +59,7 @@ private[spark] object GradientBoostedTrees extends Logging { /** * Method to validate a gradient boosting model - * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + * @param input Training dataset: RDD of [[LabeledPoint]]. * @param validationInput Validation dataset. * This dataset should be different from the training dataset, * but it should follow the same distribution. @@ -162,7 +162,7 @@ private[spark] object GradientBoostedTrees extends Logging { * Method to calculate error of the base learner for the gradient boosting calculation. * Note: This method is not used by the gradient boosting algorithm but is useful for debugging * purposes. - * @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. 
+   * @param data Training dataset: RDD of [[LabeledPoint]].
    * @param trees Boosted Decision Tree models
    * @param treeWeights Learning rates at each boosting iteration.
    * @param loss evaluation metric.
@@ -184,7 +184,7 @@ private[spark] object GradientBoostedTrees extends Logging {
   /**
    * Method to compute error or loss for every iteration of gradient boosting.
    *
-   * @param data RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param data RDD of [[LabeledPoint]]
    * @param trees Boosted Decision Tree models
    * @param treeWeights Learning rates at each boosting iteration.
    * @param loss evaluation metric.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
index b504f411d256d..8ae5ca3c84b0e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
@@ -82,7 +82,7 @@ private[spark] object RandomForest extends Logging {
   /**
    * Train a random forest.
    *
-   * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param input Training data: RDD of [[LabeledPoint]]
    * @return an unweighted set of trees
    */
  def run(
@@ -343,7 +343,7 @@ private[spark] object RandomForest extends Logging {
   /**
    * Given a group of nodes, this finds the best split for each node.
    *
-   * @param input Training data: RDD of [[org.apache.spark.ml.tree.impl.TreePoint]]
+   * @param input Training data: RDD of [[TreePoint]]
    * @param metadata Learning and dataset metadata
    * @param topNodesForGroup For each tree in group, tree index -> root node.
    *                         Used for matching instances with nodes.
@@ -854,10 +854,10 @@ private[spark] object RandomForest extends Logging {
    * and for multiclass classification with a high-arity feature,
    * there is one bin per category.
    *
-   * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
+   * @param input Training data: RDD of [[LabeledPoint]]
    * @param metadata Learning and dataset metadata
    * @param seed random seed
-   * @return Splits, an Array of [[org.apache.spark.mllib.tree.model.Split]]
+   * @return Splits, an Array of [[Split]]
    *         of size (numFeatures, numSplits)
    */
  protected[tree] def findSplits(

From 978798880c0b1e6a15e8a342847e1ff4d83a5ac0 Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 17 Nov 2016 17:04:19 +0000
Subject: [PATCH 138/534] [SPARK-18490][SQL] duplication nodename extrainfo for ShuffleExchange

## What changes were proposed in this pull request?

In ShuffleExchange, the nodeName's extraInfo is the same whether exchangeCoordinator.isEstimated is true or false, so this PR merges the two cases into one.

Author: root

Closes #15920 from windpiger/DupNodeNameShuffleExchange.
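A minimal sketch of the simplification, using a stand-in type rather than the real `ShuffleExchange`/`ExchangeCoordinator` classes:

    // Stand-in type, not the real ExchangeCoordinator, used only to show the shape of the fix.
    case class Coordinator(isEstimated: Boolean)

    def extraInfo(coordinator: Option[Coordinator]): String = coordinator match {
      // One case now covers isEstimated == true and isEstimated == false alike,
      // because the string produced never depended on that flag.
      case Some(_) => s"(coordinator id: ${System.identityHashCode(coordinator)})"
      case None => ""
    }

Both of the original guarded cases produced exactly the same string, so collapsing them changes no output, as the diff below shows.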
(cherry picked from commit b0aa1aa1af6c513a6a881eaea96abdd2b480ef98) Signed-off-by: Sean Owen --- .../apache/spark/sql/execution/exchange/ShuffleExchange.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala index 7a4a251370706..125a4930c6528 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala @@ -45,9 +45,7 @@ case class ShuffleExchange( override def nodeName: String = { val extraInfo = coordinator match { - case Some(exchangeCoordinator) if exchangeCoordinator.isEstimated => - s"(coordinator id: ${System.identityHashCode(coordinator)})" - case Some(exchangeCoordinator) if !exchangeCoordinator.isEstimated => + case Some(exchangeCoordinator) => s"(coordinator id: ${System.identityHashCode(coordinator)})" case None => "" } From fc466be4fd8def06880f59d50e5567c22cc53d6a Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 17 Nov 2016 17:31:12 -0800 Subject: [PATCH 139/534] [SPARK-18360][SQL] default table path of tables in default database should depend on the location of default database ## What changes were proposed in this pull request? The current semantic of the warehouse config: 1. it's a static config, which means you can't change it once your spark application is launched. 2. Once a database is created, its location won't change even the warehouse path config is changed. 3. default database is a special case, although its location is fixed, but the locations of tables created in it are not. If a Spark app starts with warehouse path B(while the location of default database is A), then users create a table `tbl` in default database, its location will be `B/tbl` instead of `A/tbl`. If uses change the warehouse path config to C, and create another table `tbl2`, its location will still be `B/tbl2` instead of `C/tbl2`. rule 3 doesn't make sense and I think we made it by mistake, not intentionally. Data source tables don't follow rule 3 and treat default database like normal ones. This PR fixes hive serde tables to make it consistent with data source tables. ## How was this patch tested? HiveSparkSubmitSuite Author: Wenchen Fan Closes #15812 from cloud-fan/default-db. (cherry picked from commit ce13c2672318242748f7520ed4ce6bcfad4fb428) Signed-off-by: Yin Huai --- .../spark/sql/hive/HiveExternalCatalog.scala | 237 ++++++++++-------- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 76 +++++- 2 files changed, 190 insertions(+), 123 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 843305883abc8..cacffcf33c263 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -197,136 +197,151 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat if (tableDefinition.tableType == VIEW) { client.createTable(tableDefinition, ignoreIfExists) - } else if (tableDefinition.provider.get == DDLUtils.HIVE_PROVIDER) { - // Here we follow data source tables and put table metadata like provider, schema, etc. 
in - // table properties, so that we can work around the Hive metastore issue about not case - // preserving and make Hive serde table support mixed-case column names. - val tableWithDataSourceProps = tableDefinition.copy( - properties = tableDefinition.properties ++ tableMetaToTableProps(tableDefinition)) - client.createTable(tableWithDataSourceProps, ignoreIfExists) } else { - // To work around some hive metastore issues, e.g. not case-preserving, bad decimal type - // support, no column nullability, etc., we should do some extra works before saving table - // metadata into Hive metastore: - // 1. Put table metadata like provider, schema, etc. in table properties. - // 2. Check if this table is hive compatible. - // 2.1 If it's not hive compatible, set location URI, schema, partition columns and bucket - // spec to empty and save table metadata to Hive. - // 2.2 If it's hive compatible, set serde information in table metadata and try to save - // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 - val tableProperties = tableMetaToTableProps(tableDefinition) - // Ideally we should not create a managed table with location, but Hive serde table can // specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have // to create the table directory and write out data before we create this table, to avoid // exposing a partial written table. val needDefaultTableLocation = tableDefinition.tableType == MANAGED && tableDefinition.storage.locationUri.isEmpty + val tableLocation = if (needDefaultTableLocation) { Some(defaultTablePath(tableDefinition.identifier)) } else { tableDefinition.storage.locationUri } - // Ideally we should also put `locationUri` in table properties like provider, schema, etc. - // However, in older version of Spark we already store table location in storage properties - // with key "path". Here we keep this behaviour for backward compatibility. - val storagePropsWithLocation = tableDefinition.storage.properties ++ - tableLocation.map("path" -> _) - - // converts the table metadata to Spark SQL specific format, i.e. set data schema, names and - // bucket specification to empty. Note that partition columns are retained, so that we can - // call partition-related Hive API later. - def newSparkSQLSpecificMetastoreTable(): CatalogTable = { - tableDefinition.copy( - // Hive only allows directory paths as location URIs while Spark SQL data source tables - // also allow file paths. For non-hive-compatible format, we should not set location URI - // to avoid hive metastore to throw exception. - storage = tableDefinition.storage.copy( - locationUri = None, - properties = storagePropsWithLocation), - schema = tableDefinition.partitionSchema, - bucketSpec = None, - properties = tableDefinition.properties ++ tableProperties) + + if (tableDefinition.provider.get == DDLUtils.HIVE_PROVIDER) { + val tableWithDataSourceProps = tableDefinition.copy( + // We can't leave `locationUri` empty and count on Hive metastore to set a default table + // location, because Hive metastore uses hive.metastore.warehouse.dir to generate default + // table location for tables in default database, while we expect to use the location of + // default database. + storage = tableDefinition.storage.copy(locationUri = tableLocation), + // Here we follow data source tables and put table metadata like provider, schema, etc. 
in + // table properties, so that we can work around the Hive metastore issue about not case + // preserving and make Hive serde table support mixed-case column names. + properties = tableDefinition.properties ++ tableMetaToTableProps(tableDefinition)) + client.createTable(tableWithDataSourceProps, ignoreIfExists) + } else { + createDataSourceTable( + tableDefinition.withNewStorage(locationUri = tableLocation), + ignoreIfExists) } + } + } - // converts the table metadata to Hive compatible format, i.e. set the serde information. - def newHiveCompatibleMetastoreTable(serde: HiveSerDe): CatalogTable = { - val location = if (tableDefinition.tableType == EXTERNAL) { - // When we hit this branch, we are saving an external data source table with hive - // compatible format, which means the data source is file-based and must have a `path`. - require(tableDefinition.storage.locationUri.isDefined, - "External file-based data source table must have a `path` entry in storage properties.") - Some(new Path(tableDefinition.location).toUri.toString) - } else { - None - } + private def createDataSourceTable(table: CatalogTable, ignoreIfExists: Boolean): Unit = { + // To work around some hive metastore issues, e.g. not case-preserving, bad decimal type + // support, no column nullability, etc., we should do some extra works before saving table + // metadata into Hive metastore: + // 1. Put table metadata like provider, schema, etc. in table properties. + // 2. Check if this table is hive compatible. + // 2.1 If it's not hive compatible, set location URI, schema, partition columns and bucket + // spec to empty and save table metadata to Hive. + // 2.2 If it's hive compatible, set serde information in table metadata and try to save + // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 + val tableProperties = tableMetaToTableProps(table) + + // Ideally we should also put `locationUri` in table properties like provider, schema, etc. + // However, in older version of Spark we already store table location in storage properties + // with key "path". Here we keep this behaviour for backward compatibility. + val storagePropsWithLocation = table.storage.properties ++ + table.storage.locationUri.map("path" -> _) + + // converts the table metadata to Spark SQL specific format, i.e. set data schema, names and + // bucket specification to empty. Note that partition columns are retained, so that we can + // call partition-related Hive API later. + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { + table.copy( + // Hive only allows directory paths as location URIs while Spark SQL data source tables + // also allow file paths. For non-hive-compatible format, we should not set location URI + // to avoid hive metastore to throw exception. + storage = table.storage.copy( + locationUri = None, + properties = storagePropsWithLocation), + schema = table.partitionSchema, + bucketSpec = None, + properties = table.properties ++ tableProperties) + } - tableDefinition.copy( - storage = tableDefinition.storage.copy( - locationUri = location, - inputFormat = serde.inputFormat, - outputFormat = serde.outputFormat, - serde = serde.serde, - properties = storagePropsWithLocation - ), - properties = tableDefinition.properties ++ tableProperties) + // converts the table metadata to Hive compatible format, i.e. set the serde information. 
+ def newHiveCompatibleMetastoreTable(serde: HiveSerDe): CatalogTable = { + val location = if (table.tableType == EXTERNAL) { + // When we hit this branch, we are saving an external data source table with hive + // compatible format, which means the data source is file-based and must have a `path`. + require(table.storage.locationUri.isDefined, + "External file-based data source table must have a `path` entry in storage properties.") + Some(new Path(table.location).toUri.toString) + } else { + None } - val qualifiedTableName = tableDefinition.identifier.quotedString - val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) - val skipHiveMetadata = tableDefinition.storage.properties - .getOrElse("skipHiveMetadata", "false").toBoolean - - val (hiveCompatibleTable, logMessage) = maybeSerde match { - case _ if skipHiveMetadata => - val message = - s"Persisting data source table $qualifiedTableName into Hive metastore in" + - "Spark SQL specific format, which is NOT compatible with Hive." - (None, message) - - // our bucketing is un-compatible with hive(different hash function) - case _ if tableDefinition.bucketSpec.nonEmpty => - val message = - s"Persisting bucketed data source table $qualifiedTableName into " + - "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " - (None, message) - - case Some(serde) => - val message = - s"Persisting file based data source table $qualifiedTableName into " + - s"Hive metastore in Hive compatible format." - (Some(newHiveCompatibleMetastoreTable(serde)), message) - - case _ => - val provider = tableDefinition.provider.get - val message = - s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + - s"Persisting data source table $qualifiedTableName into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive." - (None, message) - } + table.copy( + storage = table.storage.copy( + locationUri = location, + inputFormat = serde.inputFormat, + outputFormat = serde.outputFormat, + serde = serde.serde, + properties = storagePropsWithLocation + ), + properties = table.properties ++ tableProperties) + } - (hiveCompatibleTable, logMessage) match { - case (Some(table), message) => - // We first try to save the metadata of the table in a Hive compatible way. - // If Hive throws an error, we fall back to save its metadata in the Spark SQL - // specific way. - try { - logInfo(message) - saveTableIntoHive(table, ignoreIfExists) - } catch { - case NonFatal(e) => - val warningMessage = - s"Could not persist ${tableDefinition.identifier.quotedString} in a Hive " + - "compatible way. Persisting it into Hive metastore in Spark SQL specific format." - logWarning(warningMessage, e) - saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) - } + val qualifiedTableName = table.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(table.provider.get) + val skipHiveMetadata = table.storage.properties + .getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = maybeSerde match { + case _ if skipHiveMetadata => + val message = + s"Persisting data source table $qualifiedTableName into Hive metastore in" + + "Spark SQL specific format, which is NOT compatible with Hive." 
+ (None, message) + + // our bucketing is un-compatible with hive(different hash function) + case _ if table.bucketSpec.nonEmpty => + val message = + s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + + case Some(serde) => + val message = + s"Persisting file based data source table $qualifiedTableName into " + + s"Hive metastore in Hive compatible format." + (Some(newHiveCompatibleMetastoreTable(serde)), message) + + case _ => + val provider = table.provider.get + val message = + s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + + s"Persisting data source table $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + } - case (None, message) => - logWarning(message) - saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) - } + (hiveCompatibleTable, logMessage) match { + case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatible way. + // If Hive throws an error, we fall back to save its metadata in the Spark SQL + // specific way. + try { + logInfo(message) + saveTableIntoHive(table, ignoreIfExists) + } catch { + case NonFatal(e) => + val warningMessage = + s"Could not persist ${table.identifier.quotedString} in a Hive " + + "compatible way. Persisting it into Hive metastore in Spark SQL specific format." + logWarning(warningMessage, e) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) + } + + case (None, message) => + logWarning(message) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index fbd705172cae6..a670560c5969d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -24,6 +24,7 @@ import java.util.Date import scala.collection.mutable.ArrayBuffer import scala.tools.nsc.Properties +import org.apache.hadoop.fs.Path import org.scalatest.{BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.Timeouts import org.scalatest.exceptions.TestFailedDueToTimeoutException @@ -33,11 +34,12 @@ import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, FunctionResource, JarResource} +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer -import org.apache.spark.sql.types.DecimalType +import org.apache.spark.sql.types.{DecimalType, StructType} import org.apache.spark.util.{ResetSystemProperties, Utils} /** @@ -295,6 +297,20 @@ class HiveSparkSubmitSuite runSparkSubmit(args) } + test("SPARK-18360: default table path of tables in default database should depend on the " + + "location of default database") { + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + val args = Seq( + "--class", 
SPARK_18360.getClass.getName.stripSuffix("$"), + "--name", "SPARK-18360", + "--master", "local-cluster[2,1,1024]", + "--conf", "spark.ui.enabled=false", + "--conf", "spark.master.rest.enabled=false", + "--driver-java-options", "-Dderby.system.durability=test", + unusedJar.toString) + runSparkSubmit(args) + } + // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. // This is copied from org.apache.spark.deploy.SparkSubmitSuite private def runSparkSubmit(args: Seq[String]): Unit = { @@ -397,11 +413,7 @@ object SetWarehouseLocationTest extends Logging { def main(args: Array[String]): Unit = { Utils.configTestLog4j("INFO") - val sparkConf = new SparkConf(loadDefaults = true) - val builder = SparkSession.builder() - .config(sparkConf) - .config("spark.ui.enabled", "false") - .enableHiveSupport() + val sparkConf = new SparkConf(loadDefaults = true).set("spark.ui.enabled", "false") val providedExpectedWarehouseLocation = sparkConf.getOption("spark.sql.test.expectedWarehouseDir") @@ -410,7 +422,7 @@ object SetWarehouseLocationTest extends Logging { // If spark.sql.test.expectedWarehouseDir is set, the warehouse dir is set // through spark-summit. So, neither spark.sql.warehouse.dir nor // hive.metastore.warehouse.dir is set at here. - (builder.getOrCreate(), warehouseDir) + (new TestHiveContext(new SparkContext(sparkConf)).sparkSession, warehouseDir) case None => val warehouseLocation = Utils.createTempDir() warehouseLocation.delete() @@ -420,10 +432,10 @@ object SetWarehouseLocationTest extends Logging { // spark.sql.warehouse.dir and hive.metastore.warehouse.dir. // We are expecting that the value of spark.sql.warehouse.dir will override the // value of hive.metastore.warehouse.dir. - val session = builder - .config("spark.sql.warehouse.dir", warehouseLocation.toString) - .config("hive.metastore.warehouse.dir", hiveWarehouseLocation.toString) - .getOrCreate() + val session = new TestHiveContext(new SparkContext(sparkConf + .set("spark.sql.warehouse.dir", warehouseLocation.toString) + .set("hive.metastore.warehouse.dir", hiveWarehouseLocation.toString))) + .sparkSession (session, warehouseLocation.toString) } @@ -801,3 +813,43 @@ object SPARK_14244 extends QueryTest { } } } + +object SPARK_18360 { + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .config("spark.ui.enabled", "false") + .enableHiveSupport().getOrCreate() + + val defaultDbLocation = spark.catalog.getDatabase("default").locationUri + assert(new Path(defaultDbLocation) == new Path(spark.sharedState.warehousePath)) + + val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client + + try { + val tableMeta = CatalogTable( + identifier = TableIdentifier("test_tbl", Some("default")), + tableType = CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty, + schema = new StructType().add("i", "int"), + provider = Some(DDLUtils.HIVE_PROVIDER)) + + val newWarehousePath = Utils.createTempDir().getAbsolutePath + hiveClient.runSqlHive(s"SET hive.metastore.warehouse.dir=$newWarehousePath") + hiveClient.createTable(tableMeta, ignoreIfExists = false) + val rawTable = hiveClient.getTable("default", "test_tbl") + // Hive will use the value of `hive.metastore.warehouse.dir` to generate default table + // location for tables in default database. 
+ assert(rawTable.storage.locationUri.get.contains(newWarehousePath)) + hiveClient.dropTable("default", "test_tbl", ignoreIfNotExists = false, purge = false) + + spark.sharedState.externalCatalog.createTable(tableMeta, ignoreIfExists = false) + val readBack = spark.sharedState.externalCatalog.getTable("default", "test_tbl") + // Spark SQL will use the location of default database to generate default table + // location for tables in default database. + assert(readBack.storage.locationUri.get.contains(defaultDbLocation)) + } finally { + hiveClient.dropTable("default", "test_tbl", ignoreIfNotExists = true, purge = false) + hiveClient.runSqlHive(s"SET hive.metastore.warehouse.dir=$defaultDbLocation") + } + } +} From e8b1955e20a966da9a95f75320680cbab1096540 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 17 Nov 2016 18:45:15 -0800 Subject: [PATCH 140/534] [SPARK-18462] Fix ClassCastException in SparkListenerDriverAccumUpdates event ## What changes were proposed in this pull request? This patch fixes a `ClassCastException: java.lang.Integer cannot be cast to java.lang.Long` error which could occur in the HistoryServer while trying to process a deserialized `SparkListenerDriverAccumUpdates` event. The problem stems from how `jackson-module-scala` handles primitive type parameters (see https://github.com/FasterXML/jackson-module-scala/wiki/FAQ#deserializing-optionint-and-other-primitive-challenges for more details). This was causing a problem where our code expected a field to be deserialized as a `(Long, Long)` tuple but we got an `(Int, Int)` tuple instead. This patch hacks around this issue by registering a custom `Converter` with Jackson in order to deserialize the tuples as `(Object, Object)` and perform the appropriate casting. ## How was this patch tested? New regression tests in `SQLListenerSuite`. Author: Josh Rosen Closes #15922 from JoshRosen/SPARK-18462. (cherry picked from commit d9dd979d170f44383a9a87f892f2486ddb3cca7d) Signed-off-by: Reynold Xin --- .../spark/sql/execution/ui/SQLListener.scala | 39 +++++++++++++++- .../sql/execution/ui/SQLListenerSuite.scala | 44 ++++++++++++++++++- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala index 60f13432d78d2..5daf21595d8a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -19,6 +19,11 @@ package org.apache.spark.sql.execution.ui import scala.collection.mutable +import com.fasterxml.jackson.databind.JavaType +import com.fasterxml.jackson.databind.`type`.TypeFactory +import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import com.fasterxml.jackson.databind.util.Converter + import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging @@ -43,9 +48,41 @@ case class SparkListenerSQLExecutionEnd(executionId: Long, time: Long) extends SparkListenerEvent @DeveloperApi -case class SparkListenerDriverAccumUpdates(executionId: Long, accumUpdates: Seq[(Long, Long)]) +case class SparkListenerDriverAccumUpdates( + executionId: Long, + @JsonDeserialize(contentConverter = classOf[LongLongTupleConverter]) + accumUpdates: Seq[(Long, Long)]) extends SparkListenerEvent +/** + * Jackson [[Converter]] for converting an (Int, Int) tuple into a (Long, Long) tuple. 
+ * + * This is necessary due to limitations in how Jackson's scala module deserializes primitives; + * see the "Deserializing Option[Int] and other primitive challenges" section in + * https://github.com/FasterXML/jackson-module-scala/wiki/FAQ for a discussion of this issue and + * SPARK-18462 for the specific problem that motivated this conversion. + */ +private class LongLongTupleConverter extends Converter[(Object, Object), (Long, Long)] { + + override def convert(in: (Object, Object)): (Long, Long) = { + def toLong(a: Object): Long = a match { + case i: java.lang.Integer => i.intValue() + case l: java.lang.Long => l.longValue() + } + (toLong(in._1), toLong(in._2)) + } + + override def getInputType(typeFactory: TypeFactory): JavaType = { + val objectType = typeFactory.uncheckedSimpleType(classOf[Object]) + typeFactory.constructSimpleType(classOf[(_, _)], classOf[(_, _)], Array(objectType, objectType)) + } + + override def getOutputType(typeFactory: TypeFactory): JavaType = { + val longType = typeFactory.uncheckedSimpleType(classOf[Long]) + typeFactory.constructSimpleType(classOf[(_, _)], classOf[(_, _)], Array(longType, longType)) + } +} + class SQLHistoryListenerFactory extends SparkHistoryListenerFactory { override def createListeners(conf: SparkConf, sparkUI: SparkUI): Seq[SparkListener] = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index 19b6d2603129c..7b4ff675fba72 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.ui import java.util.Properties +import org.json4s.jackson.JsonMethods._ import org.mockito.Mockito.mock import org.apache.spark._ @@ -35,10 +36,10 @@ import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlanIn import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.ui.SparkUI -import org.apache.spark.util.{AccumulatorMetadata, LongAccumulator} +import org.apache.spark.util.{AccumulatorMetadata, JsonProtocol, LongAccumulator} -class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { +class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTestUtils { import testImplicits._ import org.apache.spark.AccumulatorSuite.makeInfo @@ -416,6 +417,45 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { assert(driverUpdates(physicalPlan.longMetric("dummy").id) == expectedAccumValue) } + test("roundtripping SparkListenerDriverAccumUpdates through JsonProtocol (SPARK-18462)") { + val event = SparkListenerDriverAccumUpdates(1L, Seq((2L, 3L))) + val json = JsonProtocol.sparkEventToJson(event) + assertValidDataInJson(json, + parse(""" + |{ + | "Event": "org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates", + | "executionId": 1, + | "accumUpdates": [[2,3]] + |} + """.stripMargin)) + JsonProtocol.sparkEventFromJson(json) match { + case SparkListenerDriverAccumUpdates(executionId, accums) => + assert(executionId == 1L) + accums.foreach { case (a, b) => + assert(a == 2L) + assert(b == 3L) + } + } + + // Test a case where the numbers in the JSON can only fit in longs: + val longJson = parse( + """ + |{ + | "Event": "org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates", + | "executionId": 
4294967294, + | "accumUpdates": [[4294967294,3]] + |} + """.stripMargin) + JsonProtocol.sparkEventFromJson(longJson) match { + case SparkListenerDriverAccumUpdates(executionId, accums) => + assert(executionId == 4294967294L) + accums.foreach { case (a, b) => + assert(a == 4294967294L) + assert(b == 3L) + } + } + } + } From 5912c19e76719a1c388a7a151af03ebf71b8f0db Mon Sep 17 00:00:00 2001 From: Tyson Condie Date: Fri, 18 Nov 2016 11:11:24 -0800 Subject: [PATCH 141/534] [SPARK-18187][SQL] CompactibleFileStreamLog should not use "compactInterval" direcly with user setting. ## What changes were proposed in this pull request? CompactibleFileStreamLog relys on "compactInterval" to detect a compaction batch. If the "compactInterval" is reset by user, CompactibleFileStreamLog will return wrong answer, resulting data loss. This PR procides a way to check the validity of 'compactInterval', and calculate an appropriate value. ## How was this patch tested? When restart a stream, we change the 'spark.sql.streaming.fileSource.log.compactInterval' different with the former one. The primary solution to this issue was given by uncleGen Added extensions include an additional metadata field in OffsetSeq and CompactibleFileStreamLog APIs. zsxwing Author: Tyson Condie Author: genmao.ygm Closes #15852 from tcondie/spark-18187. (cherry picked from commit 51baca2219fda8692b88fc8552548544aec73a1e) Signed-off-by: Shixiong Zhu --- .../streaming/CompactibleFileStreamLog.scala | 61 ++++++++++++++++++- .../streaming/FileStreamSinkLog.scala | 8 ++- .../streaming/FileStreamSourceLog.scala | 9 +-- .../execution/streaming/HDFSMetadataLog.scala | 2 +- .../sql/execution/streaming/OffsetSeq.scala | 12 +++- .../execution/streaming/OffsetSeqLog.scala | 31 +++++++--- .../CompactibleFileStreamLogSuite.scala | 33 ++++++++++ .../sql/streaming/FileStreamSourceSuite.scala | 41 ++++++++----- .../spark/sql/streaming/StreamTest.scala | 20 +++++- 9 files changed, 178 insertions(+), 39 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala index 8af3db1968882..8529ceac30f1e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLog.scala @@ -63,7 +63,46 @@ abstract class CompactibleFileStreamLog[T <: AnyRef : ClassTag]( protected def isDeletingExpiredLog: Boolean - protected def compactInterval: Int + protected def defaultCompactInterval: Int + + protected final lazy val compactInterval: Int = { + // SPARK-18187: "compactInterval" can be set by user via defaultCompactInterval. + // If there are existing log entries, then we should ensure a compatible compactInterval + // is used, irrespective of the defaultCompactInterval. There are three cases: + // + // 1. If there is no '.compact' file, we can use the default setting directly. + // 2. If there are two or more '.compact' files, we use the interval of patch id suffix with + // '.compact' as compactInterval. This case could arise if isDeletingExpiredLog == false. + // 3. 
If there is only one '.compact' file, then we must find a compact interval + // that is compatible with (i.e., a divisor of) the previous compact file, and that + // faithfully tries to represent the revised default compact interval i.e., is at least + // is large if possible. + // e.g., if defaultCompactInterval is 5 (and previous compact interval could have + // been any 2,3,4,6,12), then a log could be: 11.compact, 12, 13, in which case + // will ensure that the new compactInterval = 6 > 5 and (11 + 1) % 6 == 0 + val compactibleBatchIds = fileManager.list(metadataPath, batchFilesFilter) + .filter(f => f.getPath.toString.endsWith(CompactibleFileStreamLog.COMPACT_FILE_SUFFIX)) + .map(f => pathToBatchId(f.getPath)) + .sorted + .reverse + + // Case 1 + var interval = defaultCompactInterval + if (compactibleBatchIds.length >= 2) { + // Case 2 + val latestCompactBatchId = compactibleBatchIds(0) + val previousCompactBatchId = compactibleBatchIds(1) + interval = (latestCompactBatchId - previousCompactBatchId).toInt + } else if (compactibleBatchIds.length == 1) { + // Case 3 + interval = CompactibleFileStreamLog.deriveCompactInterval( + defaultCompactInterval, compactibleBatchIds(0).toInt) + } + assert(interval > 0, s"intervalValue = $interval not positive value.") + logInfo(s"Set the compact interval to $interval " + + s"[defaultCompactInterval: $defaultCompactInterval]") + interval + } /** * Filter out the obsolete logs. @@ -245,4 +284,24 @@ object CompactibleFileStreamLog { def nextCompactionBatchId(batchId: Long, compactInterval: Long): Long = { (batchId + compactInterval + 1) / compactInterval * compactInterval - 1 } + + /** + * Derives a compact interval from the latest compact batch id and + * a default compact interval. + */ + def deriveCompactInterval(defaultInterval: Int, latestCompactBatchId: Int) : Int = { + if (latestCompactBatchId + 1 <= defaultInterval) { + latestCompactBatchId + 1 + } else if (defaultInterval < (latestCompactBatchId + 1) / 2) { + // Find the first divisor >= default compact interval + def properDivisors(min: Int, n: Int) = + (min to n/2).view.filter(i => n % i == 0) :+ n + + properDivisors(defaultInterval, latestCompactBatchId + 1).head + } else { + // default compact interval > than any divisor other than latest compact id + latestCompactBatchId + 1 + } + } } + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala index b4f14151f1ef2..eb6eed87eca7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLog.scala @@ -88,9 +88,11 @@ class FileStreamSinkLog( protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion - protected override val compactInterval = sparkSession.sessionState.conf.fileSinkLogCompactInterval - require(compactInterval > 0, - s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $compactInterval) " + + protected override val defaultCompactInterval = + sparkSession.sessionState.conf.fileSinkLogCompactInterval + + require(defaultCompactInterval > 0, + s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " + "to a positive value.") override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = { diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala index fe81b15607068..327b3ac267766 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceLog.scala @@ -38,11 +38,12 @@ class FileStreamSourceLog( import CompactibleFileStreamLog._ // Configurations about metadata compaction - protected override val compactInterval = + protected override val defaultCompactInterval: Int = sparkSession.sessionState.conf.fileSourceLogCompactInterval - require(compactInterval > 0, - s"Please set ${SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key} (was $compactInterval) to a " + - s"positive value.") + + require(defaultCompactInterval > 0, + s"Please set ${SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key} " + + s"(was $defaultCompactInterval) to a positive value.") protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSourceLogCleanupDelay diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index db7057d7da70c..080729b2ca8d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -70,7 +70,7 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: /** * A `PathFilter` to filter only batch files */ - private val batchFilesFilter = new PathFilter { + protected val batchFilesFilter = new PathFilter { override def accept(path: Path): Boolean = isBatchFile(path) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index a4e1fe6797097..7469caeee3be5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -23,7 +23,7 @@ package org.apache.spark.sql.execution.streaming * [[Source]]s that are present in a streaming query. This is similar to simplified, single-instance * vector clock that must progress linearly forward. */ -case class OffsetSeq(offsets: Seq[Option[Offset]]) { +case class OffsetSeq(offsets: Seq[Option[Offset]], metadata: Option[String] = None) { /** * Unpacks an offset into [[StreamProgress]] by associating each offset with the order list of @@ -47,7 +47,13 @@ object OffsetSeq { * Returns a [[OffsetSeq]] with a variable sequence of offsets. * `nulls` in the sequence are converted to `None`s. */ - def fill(offsets: Offset*): OffsetSeq = { - OffsetSeq(offsets.map(Option(_))) + def fill(offsets: Offset*): OffsetSeq = OffsetSeq.fill(None, offsets: _*) + + /** + * Returns a [[OffsetSeq]] with metadata and a variable sequence of offsets. + * `nulls` in the sequence are converted to `None`s. 
+ */ + def fill(metadata: Option[String], offsets: Offset*): OffsetSeq = { + OffsetSeq(offsets.map(Option(_)), metadata) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala index d1c9d95be9fdb..cc25b4474ba2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala @@ -33,12 +33,13 @@ import org.apache.spark.sql.SparkSession * by a newline character. If a source offset is missing, then * that line will contain a string value defined in the * SERIALIZED_VOID_OFFSET variable in [[OffsetSeqLog]] companion object. - * For instance, when dealine wiht [[LongOffset]] types: - * v1 // version 1 - * {0} // LongOffset 0 - * {3} // LongOffset 3 - * - // No offset for this source i.e., an invalid JSON string - * {2} // LongOffset 2 + * For instance, when dealing with [[LongOffset]] types: + * v1 // version 1 + * metadata + * {0} // LongOffset 0 + * {3} // LongOffset 3 + * - // No offset for this source i.e., an invalid JSON string + * {2} // LongOffset 2 * ... */ class OffsetSeqLog(sparkSession: SparkSession, path: String) @@ -58,13 +59,25 @@ class OffsetSeqLog(sparkSession: SparkSession, path: String) if (version != OffsetSeqLog.VERSION) { throw new IllegalStateException(s"Unknown log version: ${version}") } - OffsetSeq.fill(lines.map(parseOffset).toArray: _*) + + // read metadata + val metadata = lines.next().trim match { + case "" => None + case md => Some(md) + } + OffsetSeq.fill(metadata, lines.map(parseOffset).toArray: _*) } - override protected def serialize(metadata: OffsetSeq, out: OutputStream): Unit = { + override protected def serialize(offsetSeq: OffsetSeq, out: OutputStream): Unit = { // called inside a try-finally where the underlying stream is closed in the caller out.write(OffsetSeqLog.VERSION.getBytes(UTF_8)) - metadata.offsets.map(_.map(_.json)).foreach { offset => + + // write metadata + out.write('\n') + out.write(offsetSeq.metadata.getOrElse("").getBytes(UTF_8)) + + // write offsets, one per line + offsetSeq.offsets.map(_.map(_.json)).foreach { offset => out.write('\n') offset match { case Some(json: String) => out.write(json.getBytes(UTF_8)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala new file mode 100644 index 0000000000000..2cd2157b293cb --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import org.apache.spark.SparkFunSuite + +class CompactibleFileStreamLogSuite extends SparkFunSuite { + + import CompactibleFileStreamLog._ + + test("deriveCompactInterval") { + // latestCompactBatchId(4) + 1 <= default(5) + // then use latestestCompactBatchId + 1 === 5 + assert(5 === deriveCompactInterval(5, 4)) + // First divisor of 10 greater than 4 === 5 + assert(5 === deriveCompactInterval(4, 9)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index b365af76c3795..a099153d2e58e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.streaming import java.io.File +import scala.collection.mutable + import org.scalatest.PrivateMethodTester import org.scalatest.time.SpanSugar._ @@ -896,32 +898,38 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } - test("compacat metadata log") { + test("compact interval metadata log") { val _sources = PrivateMethod[Seq[Source]]('sources) val _metadataLog = PrivateMethod[FileStreamSourceLog]('metadataLog) - def verify(execution: StreamExecution) - (batchId: Long, expectedBatches: Int): Boolean = { + def verify( + execution: StreamExecution, + batchId: Long, + expectedBatches: Int, + expectedCompactInterval: Int): Boolean = { import CompactibleFileStreamLog._ val fileSource = (execution invokePrivate _sources()).head.asInstanceOf[FileStreamSource] val metadataLog = fileSource invokePrivate _metadataLog() - if (isCompactionBatch(batchId, 2)) { + if (isCompactionBatch(batchId, expectedCompactInterval)) { val path = metadataLog.batchIdToPath(batchId) // Assert path name should be ended with compact suffix. - assert(path.getName.endsWith(COMPACT_FILE_SUFFIX)) + assert(path.getName.endsWith(COMPACT_FILE_SUFFIX), + "path does not end with compact file suffix") // Compacted batch should include all entries from start. 
val entries = metadataLog.get(batchId) - assert(entries.isDefined) - assert(entries.get.length === metadataLog.allFiles().length) - assert(metadataLog.get(None, Some(batchId)).flatMap(_._2).length === entries.get.length) + assert(entries.isDefined, "Entries not defined") + assert(entries.get.length === metadataLog.allFiles().length, "clean up check") + assert(metadataLog.get(None, Some(batchId)).flatMap(_._2).length === + entries.get.length, "Length check") } assert(metadataLog.allFiles().sortBy(_.batchId) === - metadataLog.get(None, Some(batchId)).flatMap(_._2).sortBy(_.batchId)) + metadataLog.get(None, Some(batchId)).flatMap(_._2).sortBy(_.batchId), + "Batch id mismatch") metadataLog.get(None, Some(batchId)).flatMap(_._2).length === expectedBatches } @@ -932,26 +940,27 @@ class FileStreamSourceSuite extends FileStreamSourceTest { ) { val fileStream = createFileStream("text", src.getCanonicalPath) val filtered = fileStream.filter($"value" contains "keep") + val updateConf = Map(SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "5") testStream(filtered)( AddTextFileData("drop1\nkeep2\nkeep3", src, tmp), CheckAnswer("keep2", "keep3"), - AssertOnQuery(verify(_)(0L, 1)), + AssertOnQuery(verify(_, 0L, 1, 2)), AddTextFileData("drop4\nkeep5\nkeep6", src, tmp), CheckAnswer("keep2", "keep3", "keep5", "keep6"), - AssertOnQuery(verify(_)(1L, 2)), + AssertOnQuery(verify(_, 1L, 2, 2)), AddTextFileData("drop7\nkeep8\nkeep9", src, tmp), CheckAnswer("keep2", "keep3", "keep5", "keep6", "keep8", "keep9"), - AssertOnQuery(verify(_)(2L, 3)), + AssertOnQuery(verify(_, 2L, 3, 2)), StopStream, - StartStream(), - AssertOnQuery(verify(_)(2L, 3)), + StartStream(additionalConfs = updateConf), + AssertOnQuery(verify(_, 2L, 3, 2)), AddTextFileData("drop10\nkeep11", src, tmp), CheckAnswer("keep2", "keep3", "keep5", "keep6", "keep8", "keep9", "keep11"), - AssertOnQuery(verify(_)(3L, 4)), + AssertOnQuery(verify(_, 3L, 4, 2)), AddTextFileData("drop12\nkeep13", src, tmp), CheckAnswer("keep2", "keep3", "keep5", "keep6", "keep8", "keep9", "keep11", "keep13"), - AssertOnQuery(verify(_)(4L, 5)) + AssertOnQuery(verify(_, 4L, 5, 2)) ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 742833065144d..a6b2d4b9ab4c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -161,7 +161,8 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { /** Starts the stream, resuming if data has already been processed. It must not be running. */ case class StartStream( trigger: Trigger = ProcessingTime(0), - triggerClock: Clock = new SystemClock) + triggerClock: Clock = new SystemClock, + additionalConfs: Map[String, String] = Map.empty) extends StreamAction /** Advance the trigger clock's time manually. 
*/ @@ -240,6 +241,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { var lastStream: StreamExecution = null val awaiting = new mutable.HashMap[Int, Offset]() // source index -> offset to wait for val sink = new MemorySink(stream.schema, outputMode) + val resetConfValues = mutable.Map[String, Option[String]]() @volatile var streamDeathCause: Throwable = null @@ -330,7 +332,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { startedTest.foreach { action => logInfo(s"Processing test stream action: $action") action match { - case StartStream(trigger, triggerClock) => + case StartStream(trigger, triggerClock, additionalConfs) => verify(currentStream == null, "stream already running") verify(triggerClock.isInstanceOf[SystemClock] || triggerClock.isInstanceOf[StreamManualClock], @@ -338,6 +340,14 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { if (triggerClock.isInstanceOf[StreamManualClock]) { manualClockExpectedTime = triggerClock.asInstanceOf[StreamManualClock].getTimeMillis() } + + additionalConfs.foreach(pair => { + val value = + if (spark.conf.contains(pair._1)) Some(spark.conf.get(pair._1)) else None + resetConfValues(pair._1) = value + spark.conf.set(pair._1, pair._2) + }) + lastStream = currentStream currentStream = spark @@ -519,6 +529,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with Timeouts { currentStream.stop() } spark.streams.removeListener(statusCollector) + + // Rollback prev configuration values + resetConfValues.foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } } } From ec622eb7e1ffd0775c9ca4683d1032ca8d41654a Mon Sep 17 00:00:00 2001 From: Andrew Ray Date: Fri, 18 Nov 2016 11:19:49 -0800 Subject: [PATCH 142/534] [SPARK-18457][SQL] ORC and other columnar formats using HiveShim read all columns when doing a simple count ## What changes were proposed in this pull request? When reading zero columns (e.g., count(*)) from ORC or any other format that uses HiveShim, actually set the read column list to empty for Hive to use. ## How was this patch tested? Query correctness is handled by existing unit tests. I'm happy to add more if anyone can point out some case that is not covered. Reduction in data read can be verified in the UI when built with a recent version of Hadoop say: ``` build/mvn -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.0 -Phive -DskipTests clean package ``` However the default Hadoop 2.2 that is used for unit tests does not report actual bytes read and instead just full file sizes (see FileScanRDD.scala line 80). Therefore I don't think there is a good way to add a unit test for this. I tested with the following setup using above build options ``` case class OrcData(intField: Long, stringField: String) spark.range(1,1000000).map(i => OrcData(i, s"part-$i")).toDF().write.format("orc").save("orc_test") sql( s"""CREATE EXTERNAL TABLE orc_test( | intField LONG, | stringField STRING |) |STORED AS ORC |LOCATION '${System.getProperty("user.dir") + "/orc_test"}' """.stripMargin) ``` ## Results query | Spark 2.0.2 | this PR ---|---|--- `sql("select count(*) from orc_test").collect`|4.4 MB|199.4 KB `sql("select intField from orc_test").collect`|743.4 KB|743.4 KB `sql("select * from orc_test").collect`|4.4 MB|4.4 MB Author: Andrew Ray Closes #15898 from aray/sql-orc-no-col. 
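As a supplement to the UI-based check described above, the reduction in input data can also be observed programmatically. The sketch below is not part of this patch; it assumes the `orc_test` table from the setup above and a Hadoop version recent enough to report granular bytes-read metrics (see the FileScanRDD caveat mentioned earlier).

```
// Illustrative only: sum the bytes read by tasks while running the count(*) query.
import java.util.concurrent.atomic.LongAdder
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

val bytesRead = new LongAdder
spark.sparkContext.addSparkListener(new SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // taskMetrics can be null for failed tasks, so guard the access
    Option(taskEnd.taskMetrics).foreach(m => bytesRead.add(m.inputMetrics.bytesRead))
  }
})
spark.sql("select count(*) from orc_test").collect()
// Listener events are delivered asynchronously, so allow a moment before reading the counter.
println(s"input bytes read: ${bytesRead.sum()}")
```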
(cherry picked from commit 795e9fc9213cb9941ae131aadcafddb94bde5f74) Signed-off-by: Reynold Xin --- .../org/apache/spark/sql/hive/HiveShim.scala | 6 ++--- .../spark/sql/hive/orc/OrcQuerySuite.scala | 25 ++++++++++++++++++- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala index 0d2a765a388aa..9e9894803ce25 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala @@ -69,13 +69,13 @@ private[hive] object HiveShim { } /* - * Cannot use ColumnProjectionUtils.appendReadColumns directly, if ids is null or empty + * Cannot use ColumnProjectionUtils.appendReadColumns directly, if ids is null */ def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) { - if (ids != null && ids.nonEmpty) { + if (ids != null) { ColumnProjectionUtils.appendReadColumns(conf, ids.asJava) } - if (names != null && names.nonEmpty) { + if (names != null) { appendReadColumnNames(conf, names) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index ecb5972984523..a628977af2f4e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -20,11 +20,13 @@ package org.apache.spark.sql.hive.orc import java.nio.charset.StandardCharsets import java.sql.Timestamp +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.hive.ql.io.orc.{OrcStruct, SparkOrcNewRecordReader} import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.{LogicalRelation, RecordReaderIterator} import org.apache.spark.sql.hive.{HiveUtils, MetastoreRelation} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ @@ -577,4 +579,25 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { assert(spark.table(tableName).schema == schema.copy(fields = expectedFields)) } } + + test("Empty schema does not read data from ORC file") { + val data = Seq((1, 1), (2, 2)) + withOrcFile(data) { path => + val requestedSchema = StructType(Nil) + val conf = new Configuration() + val physicalSchema = OrcFileOperator.readSchema(Seq(path), Some(conf)).get + OrcRelation.setRequiredColumns(conf, physicalSchema, requestedSchema) + val maybeOrcReader = OrcFileOperator.getFileReader(path, Some(conf)) + assert(maybeOrcReader.isDefined) + val orcRecordReader = new SparkOrcNewRecordReader( + maybeOrcReader.get, conf, 0, maybeOrcReader.get.getContentLength) + + val recordsIterator = new RecordReaderIterator[OrcStruct](orcRecordReader) + try { + assert(recordsIterator.next().toString == "{null, null}") + } finally { + recordsIterator.close() + } + } + } } From 6717981e4d76f0794a75c60586de4677c49659ad Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 18 Nov 2016 21:45:18 +0000 Subject: [PATCH 143/534] [SPARK-18422][CORE] Fix wholeTextFiles test to pass on Windows in JavaAPISuite ## What changes were proposed in this pull request? This PR fixes the test `wholeTextFiles` in `JavaAPISuite.java`. 
This is failed due to the different path format on Windows. For example, the path in `container` was ``` C:\projects\spark\target\tmp\1478967560189-0/part-00000 ``` whereas `new URI(res._1()).getPath()` was as below: ``` /C:/projects/spark/target/tmp/1478967560189-0/part-00000 ``` ## How was this patch tested? Tests in `JavaAPISuite.java`. Tested via AppVeyor. **Before** Build: https://ci.appveyor.com/project/spark-test/spark/build/63-JavaAPISuite-1 Diff: https://github.com/apache/spark/compare/master...spark-test:JavaAPISuite-1 ``` [info] Test org.apache.spark.JavaAPISuite.wholeTextFiles started [error] Test org.apache.spark.JavaAPISuite.wholeTextFiles failed: java.lang.AssertionError: expected: but was:, took 0.578 sec [error] at org.apache.spark.JavaAPISuite.wholeTextFiles(JavaAPISuite.java:1089) ... ``` **After** Build started: [CORE] `org.apache.spark.JavaAPISuite` [![PR-15866](https://ci.appveyor.com/api/projects/status/github/spark-test/spark?branch=198DDA52-F201-4D2B-BE2F-244E0C1725B2&svg=true)](https://ci.appveyor.com/project/spark-test/spark/branch/198DDA52-F201-4D2B-BE2F-244E0C1725B2) Diff: https://github.com/apache/spark/compare/master...spark-test:198DDA52-F201-4D2B-BE2F-244E0C1725B2 ``` [info] Test org.apache.spark.JavaAPISuite.wholeTextFiles started ... ``` Author: hyukjinkwon Closes #15866 from HyukjinKwon/SPARK-18422. (cherry picked from commit 40d59ff5eaac6df237fe3d50186695c3806b268c) Signed-off-by: Sean Owen --- .../java/org/apache/spark/JavaAPISuite.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 533025ba83e72..7bebe0612f9a8 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -20,7 +20,6 @@ import java.io.*; import java.nio.channels.FileChannel; import java.nio.ByteBuffer; -import java.net.URI; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; @@ -46,6 +45,7 @@ import com.google.common.collect.Lists; import com.google.common.base.Throwables; import com.google.common.io.Files; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.DefaultCodec; @@ -1075,18 +1075,23 @@ public void wholeTextFiles() throws Exception { byte[] content2 = "spark is also easy to use.\n".getBytes(StandardCharsets.UTF_8); String tempDirName = tempDir.getAbsolutePath(); - Files.write(content1, new File(tempDirName + "/part-00000")); - Files.write(content2, new File(tempDirName + "/part-00001")); + String path1 = new Path(tempDirName, "part-00000").toUri().getPath(); + String path2 = new Path(tempDirName, "part-00001").toUri().getPath(); + + Files.write(content1, new File(path1)); + Files.write(content2, new File(path2)); Map container = new HashMap<>(); - container.put(tempDirName+"/part-00000", new Text(content1).toString()); - container.put(tempDirName+"/part-00001", new Text(content2).toString()); + container.put(path1, new Text(content1).toString()); + container.put(path2, new Text(content2).toString()); JavaPairRDD readRDD = sc.wholeTextFiles(tempDirName, 3); List> result = readRDD.collect(); for (Tuple2 res : result) { - assertEquals(res._2(), container.get(new URI(res._1()).getPath())); + // Note that the paths from `wholeTextFiles` are in URI format on Windows, + // for example, file:/C:/a/b/c. 
+ assertEquals(res._2(), container.get(new Path(res._1()).toUri().getPath())); } } From 136f687c6282c328c2ae121fc3d45207550d184b Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 18 Nov 2016 16:13:02 -0800 Subject: [PATCH 144/534] [SPARK-18477][SS] Enable interrupts for HDFS in HDFSMetadataLog ## What changes were proposed in this pull request? HDFS `write` may just hang until timeout if some network error happens. It's better to enable interrupts to allow stopping the query fast on HDFS. This PR just changes the logic to only disable interrupts for local file system, as HADOOP-10622 only happens for local file system. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15911 from zsxwing/interrupt-on-dfs. (cherry picked from commit e5f5c29e021d504284fe5ad1a77dcd5a992ac10a) Signed-off-by: Tathagata Das --- .../execution/streaming/HDFSMetadataLog.scala | 56 ++++++++++++++----- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index 080729b2ca8d6..d95ec7f67feb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -105,25 +105,34 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: /** * Store the metadata for the specified batchId and return `true` if successful. If the batchId's * metadata has already been stored, this method will return `false`. - * - * Note that this method must be called on a [[org.apache.spark.util.UninterruptibleThread]] - * so that interrupts can be disabled while writing the batch file. This is because there is a - * potential dead-lock in Hadoop "Shell.runCommand" before 2.5.0 (HADOOP-10622). If the thread - * running "Shell.runCommand" is interrupted, then the thread can get deadlocked. In our - * case, `writeBatch` creates a file using HDFS API and calls "Shell.runCommand" to set the - * file permissions, and can get deadlocked if the stream execution thread is stopped by - * interrupt. Hence, we make sure that this method is called on [[UninterruptibleThread]] which - * allows us to disable interrupts here. Also see SPARK-14131. */ override def add(batchId: Long, metadata: T): Boolean = { get(batchId).map(_ => false).getOrElse { // Only write metadata when the batch has not yet been written - Thread.currentThread match { - case ut: UninterruptibleThread => - ut.runUninterruptibly { writeBatch(batchId, metadata, serialize) } - case _ => - throw new IllegalStateException( - "HDFSMetadataLog.add() must be executed on a o.a.spark.util.UninterruptibleThread") + if (fileManager.isLocalFileSystem) { + Thread.currentThread match { + case ut: UninterruptibleThread => + // When using a local file system, "writeBatch" must be called on a + // [[org.apache.spark.util.UninterruptibleThread]] so that interrupts can be disabled + // while writing the batch file. This is because there is a potential dead-lock in + // Hadoop "Shell.runCommand" before 2.5.0 (HADOOP-10622). If the thread running + // "Shell.runCommand" is interrupted, then the thread can get deadlocked. In our case, + // `writeBatch` creates a file using HDFS API and will call "Shell.runCommand" to set + // the file permission if using the local file system, and can get deadlocked if the + // stream execution thread is stopped by interrupt. 
Hence, we make sure that + // "writeBatch" is called on [[UninterruptibleThread]] which allows us to disable + // interrupts here. Also see SPARK-14131. + ut.runUninterruptibly { writeBatch(batchId, metadata, serialize) } + case _ => + throw new IllegalStateException( + "HDFSMetadataLog.add() on a local file system must be executed on " + + "a o.a.spark.util.UninterruptibleThread") + } + } else { + // For a distributed file system, such as HDFS or S3, if the network is broken, write + // operations may just hang until timeout. We should enable interrupts to allow stopping + // the query fast. + writeBatch(batchId, metadata, serialize) } true } @@ -298,6 +307,9 @@ object HDFSMetadataLog { /** Recursively delete a path if it exists. Should not throw exception if file doesn't exist. */ def delete(path: Path): Unit + + /** Whether the file systme is a local FS. */ + def isLocalFileSystem: Boolean } /** @@ -342,6 +354,13 @@ object HDFSMetadataLog { // ignore if file has already been deleted } } + + override def isLocalFileSystem: Boolean = fc.getDefaultFileSystem match { + case _: local.LocalFs | _: local.RawLocalFs => + // LocalFs = RawLocalFs + ChecksumFs + true + case _ => false + } } /** @@ -398,5 +417,12 @@ object HDFSMetadataLog { // ignore if file has already been deleted } } + + override def isLocalFileSystem: Boolean = fs match { + case _: LocalFileSystem | _: RawLocalFileSystem => + // LocalFileSystem = RawLocalFileSystem + ChecksumFileSystem + true + case _ => false + } } } From 4b1df0e89badd9bb175673aefc96d3f9358e976d Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Fri, 18 Nov 2016 16:34:11 -0800 Subject: [PATCH 145/534] [SPARK-18505][SQL] Simplify AnalyzeColumnCommand ## What changes were proposed in this pull request? I'm spending more time at the design & code level for cost-based optimizer now, and have found a number of issues related to maintainability and compatibility that I will like to address. This is a small pull request to clean up AnalyzeColumnCommand: 1. Removed warning on duplicated columns. Warnings in log messages are useless since most users that run SQL don't see them. 2. Removed the nested updateStats function, by just inlining the function. 3. Renamed a few functions to better reflect what they do. 4. Removed the factory apply method for ColumnStatStruct. It is a bad pattern to use a apply method that returns an instantiation of a class that is not of the same type (ColumnStatStruct.apply used to return CreateNamedStruct). 5. Renamed ColumnStatStruct to just AnalyzeColumnCommand. 6. Added more documentation explaining some of the non-obvious return types and code blocks. In follow-up pull requests, I'd like to address the following: 1. Get rid of the Map[String, ColumnStat] map, since internally we should be using Attribute to reference columns, rather than strings. 2. Decouple the fields exposed by ColumnStat and internals of Spark SQL's execution path. Currently the two are coupled because ColumnStat takes in an InternalRow. 3. Correctness: Remove code path that stores statistics in the catalog using the base64 encoding of the UnsafeRow format, which is not stable across Spark versions. 4. Clearly document the data representation stored in the catalog for statistics. ## How was this patch tested? Affected test cases have been updated. Author: Reynold Xin Closes #15933 from rxin/SPARK-18505. 
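For context, the user-facing entry point into `AnalyzeColumnCommand` is the `ANALYZE TABLE ... COMPUTE STATISTICS FOR COLUMNS` statement. A minimal sketch follows (not part of this patch; the table and column names are made up for illustration):

```
// Illustrative only: the SQL path that ends up in AnalyzeColumnCommand.
spark.sql("CREATE TABLE stats_demo (id INT, name STRING) USING parquet")
spark.sql("INSERT INTO stats_demo VALUES (1, 'a'), (2, 'b'), (3, 'b')")
// Computes the row count plus per-column statistics and stores them with the table's
// catalog entry; re-running for other columns merges into the existing column-stats map.
spark.sql("ANALYZE TABLE stats_demo COMPUTE STATISTICS FOR COLUMNS id, name")
```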
(cherry picked from commit 6f7ff75091154fed7649ea6d79e887aad9fbde6a) Signed-off-by: Reynold Xin --- .../command/AnalyzeColumnCommand.scala | 115 ++++++++++-------- .../spark/sql/StatisticsColumnSuite.scala | 2 +- .../org/apache/spark/sql/StatisticsTest.scala | 7 +- .../spark/sql/hive/HiveExternalCatalog.scala | 4 +- .../sql/hive/client/HiveClientImpl.scala | 2 +- 5 files changed, 74 insertions(+), 56 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 6141fab4aff0d..7fc57d09e9243 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.execution.command -import scala.collection.mutable - +import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases @@ -44,13 +43,16 @@ case class AnalyzeColumnCommand( val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db)) val relation = EliminateSubqueryAliases(sessionState.catalog.lookupRelation(tableIdentWithDB)) - relation match { + // Compute total size + val (catalogTable: CatalogTable, sizeInBytes: Long) = relation match { case catalogRel: CatalogRelation => - updateStats(catalogRel.catalogTable, + // This is a Hive serde format table + (catalogRel.catalogTable, AnalyzeTableCommand.calculateTotalSize(sessionState, catalogRel.catalogTable)) case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined => - updateStats(logicalRel.catalogTable.get, + // This is a data source format table + (logicalRel.catalogTable.get, AnalyzeTableCommand.calculateTotalSize(sessionState, logicalRel.catalogTable.get)) case otherRelation => @@ -58,45 +60,45 @@ case class AnalyzeColumnCommand( s"${otherRelation.nodeName}.") } - def updateStats(catalogTable: CatalogTable, newTotalSize: Long): Unit = { - val (rowCount, columnStats) = computeColStats(sparkSession, relation) - // We also update table-level stats in order to keep them consistent with column-level stats. - val statistics = Statistics( - sizeInBytes = newTotalSize, - rowCount = Some(rowCount), - // Newly computed column stats should override the existing ones. - colStats = catalogTable.stats.map(_.colStats).getOrElse(Map()) ++ columnStats) - sessionState.catalog.alterTable(catalogTable.copy(stats = Some(statistics))) - // Refresh the cached data source table in the catalog. - sessionState.catalog.refreshTable(tableIdentWithDB) - } + // Compute stats for each column + val (rowCount, newColStats) = + AnalyzeColumnCommand.computeColStats(sparkSession, relation, columnNames) + + // We also update table-level stats in order to keep them consistent with column-level stats. + val statistics = Statistics( + sizeInBytes = sizeInBytes, + rowCount = Some(rowCount), + // Newly computed column stats should override the existing ones. + colStats = catalogTable.stats.map(_.colStats).getOrElse(Map.empty) ++ newColStats) + + sessionState.catalog.alterTable(catalogTable.copy(stats = Some(statistics))) + + // Refresh the cached data source table in the catalog. + sessionState.catalog.refreshTable(tableIdentWithDB) Seq.empty[Row] } +} +object AnalyzeColumnCommand extends Logging { + + /** + * Compute stats for the given columns. 
+ * @return (row count, map from column name to ColumnStats) + * + * This is visible for testing. + */ def computeColStats( sparkSession: SparkSession, - relation: LogicalPlan): (Long, Map[String, ColumnStat]) = { + relation: LogicalPlan, + columnNames: Seq[String]): (Long, Map[String, ColumnStat]) = { - // check correctness of column names - val attributesToAnalyze = mutable.MutableList[Attribute]() - val duplicatedColumns = mutable.MutableList[String]() + // Resolve the column names and dedup using AttributeSet val resolver = sparkSession.sessionState.conf.resolver - columnNames.foreach { col => + val attributesToAnalyze = AttributeSet(columnNames.map { col => val exprOption = relation.output.find(attr => resolver(attr.name, col)) - val expr = exprOption.getOrElse(throw new AnalysisException(s"Invalid column name: $col.")) - // do deduplication - if (!attributesToAnalyze.contains(expr)) { - attributesToAnalyze += expr - } else { - duplicatedColumns += col - } - } - if (duplicatedColumns.nonEmpty) { - logWarning("Duplicate column names were deduplicated in `ANALYZE TABLE` statement. " + - s"Input columns: ${columnNames.mkString("(", ", ", ")")}. " + - s"Duplicate columns: ${duplicatedColumns.mkString("(", ", ", ")")}.") - } + exprOption.getOrElse(throw new AnalysisException(s"Invalid column name: $col.")) + }).toSeq // Collect statistics per column. // The first element in the result will be the overall row count, the following elements @@ -104,22 +106,21 @@ case class AnalyzeColumnCommand( // The layout of each struct follows the layout of the ColumnStats. val ndvMaxErr = sparkSession.sessionState.conf.ndvMaxError val expressions = Count(Literal(1)).toAggregateExpression() +: - attributesToAnalyze.map(ColumnStatStruct(_, ndvMaxErr)) + attributesToAnalyze.map(AnalyzeColumnCommand.createColumnStatStruct(_, ndvMaxErr)) val namedExpressions = expressions.map(e => Alias(e, e.toString)()) val statsRow = Dataset.ofRows(sparkSession, Aggregate(Nil, namedExpressions, relation)) .queryExecution.toRdd.collect().head // unwrap the result + // TODO: Get rid of numFields by using the public Dataset API. val rowCount = statsRow.getLong(0) val columnStats = attributesToAnalyze.zipWithIndex.map { case (expr, i) => - val numFields = ColumnStatStruct.numStatFields(expr.dataType) + val numFields = AnalyzeColumnCommand.numStatFields(expr.dataType) (expr.name, ColumnStat(statsRow.getStruct(i + 1, numFields))) }.toMap (rowCount, columnStats) } -} -object ColumnStatStruct { private val zero = Literal(0, LongType) private val one = Literal(1, LongType) @@ -137,7 +138,11 @@ object ColumnStatStruct { private def numTrues(e: Expression): Expression = Sum(If(e, one, zero)) private def numFalses(e: Expression): Expression = Sum(If(Not(e), one, zero)) - private def getStruct(exprs: Seq[Expression]): CreateNamedStruct = { + /** + * Creates a struct that groups the sequence of expressions together. This is used to create + * one top level struct per column. + */ + private def createStruct(exprs: Seq[Expression]): CreateNamedStruct = { CreateStruct(exprs.map { expr: Expression => expr.transformUp { case af: AggregateFunction => af.toAggregateExpression() @@ -161,6 +166,7 @@ object ColumnStatStruct { Seq(numNulls(e), numTrues(e), numFalses(e)) } + // TODO(rxin): Get rid of this function. 
def numStatFields(dataType: DataType): Int = { dataType match { case BinaryType | BooleanType => 3 @@ -168,14 +174,25 @@ object ColumnStatStruct { } } - def apply(attr: Attribute, relativeSD: Double): CreateNamedStruct = attr.dataType match { - // Use aggregate functions to compute statistics we need. - case _: NumericType | TimestampType | DateType => getStruct(numericColumnStat(attr, relativeSD)) - case StringType => getStruct(stringColumnStat(attr, relativeSD)) - case BinaryType => getStruct(binaryColumnStat(attr)) - case BooleanType => getStruct(booleanColumnStat(attr)) - case otherType => - throw new AnalysisException("Analyzing columns is not supported for column " + - s"${attr.name} of data type: ${attr.dataType}.") + /** + * Creates a struct expression that contains the statistics to collect for a column. + * + * @param attr column to collect statistics + * @param relativeSD relative error for approximate number of distinct values. + */ + def createColumnStatStruct(attr: Attribute, relativeSD: Double): CreateNamedStruct = { + attr.dataType match { + case _: NumericType | TimestampType | DateType => + createStruct(numericColumnStat(attr, relativeSD)) + case StringType => + createStruct(stringColumnStat(attr, relativeSD)) + case BinaryType => + createStruct(binaryColumnStat(attr)) + case BooleanType => + createStruct(booleanColumnStat(attr)) + case otherType => + throw new AnalysisException("Analyzing columns is not supported for column " + + s"${attr.name} of data type: ${attr.dataType}.") + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala index f1a201abd8da6..e866ac2cb3b34 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala @@ -79,7 +79,7 @@ class StatisticsColumnSuite extends StatisticsTest { val tableIdent = TableIdentifier(table, Some("default")) val relation = spark.sessionState.catalog.lookupRelation(tableIdent) val (_, columnStats) = - AnalyzeColumnCommand(tableIdent, columnsToAnalyze).computeColStats(spark, relation) + AnalyzeColumnCommand.computeColStats(spark, relation, columnsToAnalyze) assert(columnStats.contains(colName1)) assert(columnStats.contains(colName2)) // check deduplication diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala index 5134ac0e7e5b3..915ee0d31bca2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala @@ -19,11 +19,12 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} -import org.apache.spark.sql.execution.command.{AnalyzeColumnCommand, ColumnStatStruct} +import org.apache.spark.sql.execution.command.AnalyzeColumnCommand import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ + trait StatisticsTest extends QueryTest with SharedSQLContext { def checkColStats( @@ -36,7 +37,7 @@ trait StatisticsTest extends QueryTest with SharedSQLContext { val tableIdent = TableIdentifier(table, Some("default")) val relation = spark.sessionState.catalog.lookupRelation(tableIdent) val (_, columnStats) = - AnalyzeColumnCommand(tableIdent, 
columns.map(_.name)).computeColStats(spark, relation) + AnalyzeColumnCommand.computeColStats(spark, relation, columns.map(_.name)) expectedColStatsSeq.foreach { case (field, expectedColStat) => assert(columnStats.contains(field.name)) val colStat = columnStats(field.name) @@ -48,7 +49,7 @@ trait StatisticsTest extends QueryTest with SharedSQLContext { // check if we get the same colStat after encoding and decoding val encodedCS = colStat.toString - val numFields = ColumnStatStruct.numStatFields(field.dataType) + val numFields = AnalyzeColumnCommand.numStatFields(field.dataType) val decodedCS = ColumnStat(numFields, encodedCS) StatisticsTest.checkColStat( dataType = field.dataType, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index cacffcf33c263..5dbb4024bbee0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.execution.command.{ColumnStatStruct, DDLUtils} +import org.apache.spark.sql.execution.command.{AnalyzeColumnCommand, DDLUtils} import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.internal.StaticSQLConf._ @@ -634,7 +634,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat .map { case (k, v) => (k.drop(STATISTICS_COL_STATS_PREFIX.length), v) } val colStats: Map[String, ColumnStat] = tableWithSchema.schema.collect { case f if colStatsProps.contains(f.name) => - val numFields = ColumnStatStruct.numStatFields(f.dataType) + val numFields = AnalyzeColumnCommand.numStatFields(f.dataType) (f.name, ColumnStat(numFields, colStatsProps(f.name))) }.toMap tableWithSchema.copy( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 2bf9a26b0b7fc..daae8523c6366 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -97,7 +97,7 @@ private[hive] class HiveClientImpl( } // Create an internal session state for this HiveClientImpl. - val state = { + val state: SessionState = { val original = Thread.currentThread().getContextClassLoader // Switch to the initClassLoader. Thread.currentThread().setContextClassLoader(initClassLoader) From b4bad04c5e20b06992100c1d44ece9d3a5b4f817 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 18 Nov 2016 16:34:38 -0800 Subject: [PATCH 146/534] [SPARK-18497][SS] Make ForeachSink support watermark ## What changes were proposed in this pull request? The issue in ForeachSink is the new created DataSet still uses the old QueryExecution. When `foreachPartition` is called, `QueryExecution.toString` will be called and then fail because it doesn't know how to plan EventTimeWatermark. This PR just replaces the QueryExecution with IncrementalExecution to fix the issue. ## How was this patch tested? `test("foreach with watermark")`. Author: Shixiong Zhu Closes #15934 from zsxwing/SPARK-18497. 
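For reference, the query shape that previously failed and is now supported is a watermarked aggregation written through `foreach`. The sketch below is illustrative only and mirrors the new test added in this patch; `events` is an assumed streaming Dataset with an `eventTime` column, and `spark.implicits._` is assumed to be in scope.

```
import org.apache.spark.sql.{ForeachWriter, Row}
import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.OutputMode

// Watermarked windowed aggregation over the assumed `events` stream.
val counts = events
  .withWatermark("eventTime", "10 seconds")
  .groupBy(window($"eventTime", "5 seconds"))
  .agg(count("*"))

// Writing it through foreach used to fail when planning EventTimeWatermark; it now works.
val query = counts.writeStream
  .outputMode(OutputMode.Complete)
  .foreach(new ForeachWriter[Row] {
    override def open(partitionId: Long, version: Long): Boolean = true
    override def process(row: Row): Unit = println(row) // user-defined side effect
    override def close(errorOrNull: Throwable): Unit = ()
  })
  .start()
```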
(cherry picked from commit 2a40de408b5eb47edba92f9fe92a42ed1e78bf98) Signed-off-by: Tathagata Das --- .../sql/execution/streaming/ForeachSink.scala | 16 ++++----- .../streaming/ForeachSinkSuite.scala | 35 +++++++++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala index f5c550dd6ac3a..c93fcfb77cc93 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ForeachSink.scala @@ -47,22 +47,22 @@ class ForeachSink[T : Encoder](writer: ForeachWriter[T]) extends Sink with Seria // method supporting incremental planning. But in the long run, we should generally make newly // created Datasets use `IncrementalExecution` where necessary (which is SPARK-16264 tries to // resolve). - + val incrementalExecution = data.queryExecution.asInstanceOf[IncrementalExecution] val datasetWithIncrementalExecution = - new Dataset(data.sparkSession, data.logicalPlan, implicitly[Encoder[T]]) { + new Dataset(data.sparkSession, incrementalExecution, implicitly[Encoder[T]]) { override lazy val rdd: RDD[T] = { val objectType = exprEnc.deserializer.dataType val deserialized = CatalystSerde.deserialize[T](logicalPlan) // was originally: sparkSession.sessionState.executePlan(deserialized) ... - val incrementalExecution = new IncrementalExecution( + val newIncrementalExecution = new IncrementalExecution( this.sparkSession, deserialized, - data.queryExecution.asInstanceOf[IncrementalExecution].outputMode, - data.queryExecution.asInstanceOf[IncrementalExecution].checkpointLocation, - data.queryExecution.asInstanceOf[IncrementalExecution].currentBatchId, - data.queryExecution.asInstanceOf[IncrementalExecution].currentEventTimeWatermark) - incrementalExecution.toRdd.mapPartitions { rows => + incrementalExecution.outputMode, + incrementalExecution.checkpointLocation, + incrementalExecution.currentBatchId, + incrementalExecution.currentEventTimeWatermark) + newIncrementalExecution.toRdd.mapPartitions { rows => rows.map(_.get(0, objectType)) }.asInstanceOf[RDD[T]] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala index 9e059216110f2..ee6261036fdd0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/ForeachSinkSuite.scala @@ -25,6 +25,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.SparkException import org.apache.spark.sql.ForeachWriter +import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryException, StreamTest} import org.apache.spark.sql.test.SharedSQLContext @@ -169,6 +170,40 @@ class ForeachSinkSuite extends StreamTest with SharedSQLContext with BeforeAndAf assert(errorEvent.error.get.getMessage === "error") } } + + test("foreach with watermark") { + val inputData = MemoryStream[Int] + + val windowedAggregation = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"count".as[Long]) + .map(_.toInt) + .repartition(1) + + val query = 
windowedAggregation + .writeStream + .outputMode(OutputMode.Complete) + .foreach(new TestForeachWriter()) + .start() + try { + inputData.addData(10, 11, 12) + query.processAllAvailable() + + val allEvents = ForeachSinkSuite.allEvents() + assert(allEvents.size === 1) + val expectedEvents = Seq( + ForeachSinkSuite.Open(partition = 0, version = 0), + ForeachSinkSuite.Process(value = 3), + ForeachSinkSuite.Close(None) + ) + assert(allEvents === Seq(expectedEvents)) + } finally { + query.stop() + } + } } /** A global object to collect events in the executor */ From 693401be24bfefe5305038b87888cdeb641d7642 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 19 Nov 2016 09:00:11 +0000 Subject: [PATCH 147/534] [SPARK-18448][CORE] SparkSession should implement java.lang.AutoCloseable like JavaSparkContext ## What changes were proposed in this pull request? Just adds `close()` + `Closeable` as a synonym for `stop()`. This makes it usable in Java in try-with-resources, as suggested by ash211 (`Closeable` extends `AutoCloseable` BTW) ## How was this patch tested? Existing tests Author: Sean Owen Closes #15932 from srowen/SPARK-18448. (cherry picked from commit db9fb9baacbf8640dd37a507b7450db727c7e6ea) Signed-off-by: Sean Owen --- .../main/scala/org/apache/spark/sql/SparkSession.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 3045eb69f427f..58b2ab3957173 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import java.beans.Introspector +import java.io.Closeable import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ @@ -72,7 +73,7 @@ import org.apache.spark.util.Utils class SparkSession private( @transient val sparkContext: SparkContext, @transient private val existingSharedState: Option[SharedState]) - extends Serializable with Logging { self => + extends Serializable with Closeable with Logging { self => private[sql] def this(sc: SparkContext) { this(sc, None) @@ -647,6 +648,13 @@ class SparkSession private( sparkContext.stop() } + /** + * Synonym for `stop()`. + * + * @since 2.2.0 + */ + override def close(): Unit = stop() + /** * Parses the data type in our internal string representation. The data type string should * have the same format as the one generated by `toString` in scala. From 4b396a6545ec0f1e31b0e211228f04bdc5660300 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 19 Nov 2016 11:24:15 +0000 Subject: [PATCH 148/534] [SPARK-18445][BUILD][DOCS] Fix the markdown for `Note:`/`NOTE:`/`Note that`/`'''Note:'''` across Scala/Java API documentation It seems in Scala/Java, - `Note:` - `NOTE:` - `Note that` - `'''Note:'''` - `note` This PR proposes to fix those to `note` to be consistent. 
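Concretely, this is the kind of rewrite being applied throughout (an illustrative snippet, not taken from the diff):

```
// Before: a plain "Note:" sentence, rendered as ordinary body text.
/**
 * Sorts the data.
 *
 * Note: this operation triggers a shuffle.
 */
// After: the @note tag, which Scaladoc renders as a distinct note block.
/**
 * Sorts the data.
 *
 * @note This operation triggers a shuffle.
 */
def sortExample(): Unit = ???
```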
**Before** - Scala ![2016-11-17 6 16 39](https://cloud.githubusercontent.com/assets/6477701/20383180/1a7aed8c-acf2-11e6-9611-5eaf6d52c2e0.png) - Java ![2016-11-17 6 14 41](https://cloud.githubusercontent.com/assets/6477701/20383096/c8ffc680-acf1-11e6-914a-33460bf1401d.png) **After** - Scala ![2016-11-17 6 16 44](https://cloud.githubusercontent.com/assets/6477701/20383167/09940490-acf2-11e6-937a-0d5e1dc2cadf.png) - Java ![2016-11-17 6 13 39](https://cloud.githubusercontent.com/assets/6477701/20383132/e7c2a57e-acf1-11e6-9c47-b849674d4d88.png) The notes were found via ```bash grep -r "NOTE: " . | \ # Note:|NOTE:|Note that|'''Note:''' grep -v "// NOTE: " | \ # starting with // does not appear in API documentation. grep -E '.scala|.java' | \ # java/scala files grep -v Suite | \ # exclude tests grep -v Test | \ # exclude tests grep -e 'org.apache.spark.api.java' \ # packages appear in API documenation -e 'org.apache.spark.api.java.function' \ # note that this is a regular expression. So actual matches were mostly `org/apache/spark/api/java/functions ...` -e 'org.apache.spark.api.r' \ ... ``` ```bash grep -r "Note that " . | \ # Note:|NOTE:|Note that|'''Note:''' grep -v "// Note that " | \ # starting with // does not appear in API documentation. grep -E '.scala|.java' | \ # java/scala files grep -v Suite | \ # exclude tests grep -v Test | \ # exclude tests grep -e 'org.apache.spark.api.java' \ # packages appear in API documenation -e 'org.apache.spark.api.java.function' \ -e 'org.apache.spark.api.r' \ ... ``` ```bash grep -r "Note: " . | \ # Note:|NOTE:|Note that|'''Note:''' grep -v "// Note: " | \ # starting with // does not appear in API documentation. grep -E '.scala|.java' | \ # java/scala files grep -v Suite | \ # exclude tests grep -v Test | \ # exclude tests grep -e 'org.apache.spark.api.java' \ # packages appear in API documenation -e 'org.apache.spark.api.java.function' \ -e 'org.apache.spark.api.r' \ ... ``` ```bash grep -r "'''Note:'''" . | \ # Note:|NOTE:|Note that|'''Note:''' grep -v "// '''Note:''' " | \ # starting with // does not appear in API documentation. grep -E '.scala|.java' | \ # java/scala files grep -v Suite | \ # exclude tests grep -v Test | \ # exclude tests grep -e 'org.apache.spark.api.java' \ # packages appear in API documenation -e 'org.apache.spark.api.java.function' \ -e 'org.apache.spark.api.r' \ ... ``` And then fixed one by one comparing with API documentation/access modifiers. After that, manually tested via `jekyll build`. Author: hyukjinkwon Closes #15889 from HyukjinKwon/SPARK-18437. 
(cherry picked from commit d5b1d5fc80153571c308130833d0c0774de62c92) Signed-off-by: Sean Owen --- .../org/apache/spark/ContextCleaner.scala | 2 +- .../scala/org/apache/spark/Partitioner.scala | 2 +- .../scala/org/apache/spark/SparkConf.scala | 6 +- .../scala/org/apache/spark/SparkContext.scala | 47 ++++++++------- .../apache/spark/api/java/JavaDoubleRDD.scala | 4 +- .../apache/spark/api/java/JavaPairRDD.scala | 26 ++++---- .../org/apache/spark/api/java/JavaRDD.scala | 12 ++-- .../apache/spark/api/java/JavaRDDLike.scala | 3 +- .../spark/api/java/JavaSparkContext.scala | 21 +++---- .../api/java/JavaSparkStatusTracker.scala | 2 +- .../apache/spark/io/CompressionCodec.scala | 23 ++++--- .../apache/spark/partial/BoundedDouble.scala | 2 +- .../org/apache/spark/rdd/CoGroupedRDD.scala | 8 +-- .../apache/spark/rdd/DoubleRDDFunctions.scala | 2 +- .../org/apache/spark/rdd/HadoopRDD.scala | 6 +- .../org/apache/spark/rdd/NewHadoopRDD.scala | 6 +- .../apache/spark/rdd/PairRDDFunctions.scala | 23 +++---- .../spark/rdd/PartitionPruningRDD.scala | 2 +- .../spark/rdd/PartitionwiseSampledRDD.scala | 2 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 46 +++++++------- .../apache/spark/rdd/RDDCheckpointData.scala | 2 +- .../spark/rdd/ReliableCheckpointRDD.scala | 2 +- .../spark/rdd/SequenceFileRDDFunctions.scala | 5 +- .../apache/spark/rdd/ZippedWithIndexRDD.scala | 2 +- .../spark/scheduler/AccumulableInfo.scala | 10 ++-- .../spark/serializer/JavaSerializer.scala | 2 +- .../spark/serializer/KryoSerializer.scala | 2 +- .../apache/spark/serializer/Serializer.scala | 2 +- .../apache/spark/storage/StorageUtils.scala | 19 +++--- .../org/apache/spark/util/AccumulatorV2.scala | 5 +- .../spark/scheduler/DAGSchedulerSuite.scala | 2 +- docs/mllib-isotonic-regression.md | 2 +- docs/streaming-programming-guide.md | 2 +- .../spark/sql/kafka010/KafkaSource.scala | 2 +- .../spark/streaming/kafka/KafkaUtils.scala | 8 +-- .../streaming/kinesis/KinesisUtils.scala | 60 +++++++++---------- .../kinesis/KinesisBackedBlockRDDSuite.scala | 2 +- .../apache/spark/graphx/impl/GraphImpl.scala | 2 +- .../apache/spark/graphx/lib/PageRank.scala | 2 +- .../org/apache/spark/ml/linalg/Vectors.scala | 2 +- .../scala/org/apache/spark/ml/Model.scala | 2 +- .../DecisionTreeClassifier.scala | 6 +- .../ml/classification/GBTClassifier.scala | 6 +- .../classification/LogisticRegression.scala | 36 +++++------ .../spark/ml/clustering/GaussianMixture.scala | 6 +- .../spark/ml/feature/MinMaxScaler.scala | 3 +- .../spark/ml/feature/OneHotEncoder.scala | 3 +- .../org/apache/spark/ml/feature/PCA.scala | 5 +- .../spark/ml/feature/StopWordsRemover.scala | 5 +- .../spark/ml/feature/StringIndexer.scala | 6 +- .../org/apache/spark/ml/param/params.scala | 2 +- .../ml/regression/DecisionTreeRegressor.scala | 6 +- .../GeneralizedLinearRegression.scala | 4 +- .../ml/regression/LinearRegression.scala | 28 +++++---- .../ml/source/libsvm/LibSVMDataSource.scala | 2 +- .../ml/tree/impl/GradientBoostedTrees.scala | 4 +- .../org/apache/spark/ml/util/ReadWrite.scala | 2 +- .../classification/LogisticRegression.scala | 28 +++++---- .../spark/mllib/classification/SVM.scala | 20 ++++--- .../mllib/clustering/GaussianMixture.scala | 8 +-- .../spark/mllib/clustering/KMeans.scala | 8 ++- .../apache/spark/mllib/clustering/LDA.scala | 4 +- .../spark/mllib/clustering/LDAModel.scala | 2 +- .../spark/mllib/clustering/LDAOptimizer.scala | 6 +- .../mllib/evaluation/AreaUnderCurve.scala | 2 +- .../apache/spark/mllib/linalg/Vectors.scala | 6 +- .../linalg/distributed/BlockMatrix.scala | 2 
+- .../linalg/distributed/IndexedRowMatrix.scala | 5 +- .../mllib/linalg/distributed/RowMatrix.scala | 21 ++++--- .../spark/mllib/optimization/Gradient.scala | 3 +- .../apache/spark/mllib/rdd/RDDFunctions.scala | 2 +- .../MatrixFactorizationModel.scala | 6 +- .../apache/spark/mllib/stat/Statistics.scala | 34 +++++------ .../spark/mllib/tree/DecisionTree.scala | 32 +++++----- .../apache/spark/mllib/tree/loss/Loss.scala | 12 ++-- .../mllib/tree/model/treeEnsembleModels.scala | 4 +- pom.xml | 7 +++ project/SparkBuild.scala | 3 +- python/pyspark/mllib/stat/KernelDensity.py | 2 +- python/pyspark/mllib/util.py | 2 +- python/pyspark/rdd.py | 4 +- python/pyspark/streaming/kafka.py | 4 +- .../scala/org/apache/spark/sql/Encoders.scala | 8 +-- .../sql/types/CalendarIntervalType.scala | 4 +- .../scala/org/apache/spark/sql/Column.scala | 2 +- .../spark/sql/DataFrameStatFunctions.scala | 3 +- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../scala/org/apache/spark/sql/Dataset.scala | 56 ++++++++--------- .../org/apache/spark/sql/SQLContext.scala | 7 ++- .../org/apache/spark/sql/SparkSession.scala | 9 +-- .../apache/spark/sql/UDFRegistration.scala | 3 +- .../execution/streaming/state/package.scala | 4 +- .../sql/expressions/UserDefinedFunction.scala | 8 ++- .../org/apache/spark/sql/functions.scala | 22 +++---- .../apache/spark/sql/jdbc/JdbcDialects.scala | 2 +- .../apache/spark/sql/sources/interfaces.scala | 10 ++-- .../sql/util/QueryExecutionListener.scala | 8 ++- .../columnar/InMemoryColumnarQuerySuite.scala | 2 +- .../spark/streaming/StreamingContext.scala | 18 +++--- .../streaming/api/java/JavaPairDStream.scala | 2 +- .../api/java/JavaStreamingContext.scala | 40 +++++++------ .../spark/streaming/dstream/DStream.scala | 4 +- .../dstream/MapWithStateDStream.scala | 2 +- .../WriteAheadLogBackedBlockRDDSuite.scala | 2 +- 104 files changed, 516 insertions(+), 435 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 5678d790e9e76..af913454fce69 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -139,7 +139,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { periodicGCService.shutdown() } - /** Register a RDD for cleanup when it is garbage collected. */ + /** Register an RDD for cleanup when it is garbage collected. */ def registerRDDForCleanup(rdd: RDD[_]): Unit = { registerForCleanup(rdd, CleanRDD(rdd.id)) } diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 93dfbc0e6ed65..f83f5278e8b8f 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -101,7 +101,7 @@ class HashPartitioner(partitions: Int) extends Partitioner { * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly * equal ranges. The ranges are determined by sampling the content of the RDD passed in. * - * Note that the actual number of partitions created by the RangePartitioner might not be the same + * @note The actual number of partitions created by the RangePartitioner might not be the same * as the `partitions` parameter, in the case where the number of sampled records is less than * the value of `partitions`. 
*/ diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index c9c342df82c97..04d657c09afd0 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -42,10 +42,10 @@ import org.apache.spark.util.Utils * All setter methods in this class support chaining. For example, you can write * `new SparkConf().setMaster("local").setAppName("My app")`. * - * Note that once a SparkConf object is passed to Spark, it is cloned and can no longer be modified - * by the user. Spark does not support modifying the configuration at runtime. - * * @param loadDefaults whether to also load values from Java system properties + * + * @note Once a SparkConf object is passed to Spark, it is cloned and can no longer be modified + * by the user. Spark does not support modifying the configuration at runtime. */ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Serializable { diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 25a3d609a6b09..1261e3e735761 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -281,7 +281,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * A default Hadoop Configuration for the Hadoop code (e.g. file systems) that we reuse. * - * '''Note:''' As it will be reused in all Hadoop RDDs, it's better not to modify it unless you + * @note As it will be reused in all Hadoop RDDs, it's better not to modify it unless you * plan to set some global configurations for all Hadoop RDDs. */ def hadoopConfiguration: Configuration = _hadoopConfiguration @@ -700,7 +700,7 @@ class SparkContext(config: SparkConf) extends Logging { * Execute a block of code in a scope such that all new RDDs created in this body will * be part of the same scope. For more detail, see {{org.apache.spark.rdd.RDDOperationScope}}. * - * Note: Return statements are NOT allowed in the given body. + * @note Return statements are NOT allowed in the given body. */ private[spark] def withScope[U](body: => U): U = RDDOperationScope.withScope[U](this)(body) @@ -927,7 +927,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Load data from a flat binary file, assuming the length of each record is constant. * - * '''Note:''' We ensure that the byte array for each record in the resulting RDD + * @note We ensure that the byte array for each record in the resulting RDD * has the provided record length. * * @param path Directory to the input data files, the path can be comma separated paths as the @@ -970,7 +970,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param valueClass Class of the values * @param minPartitions Minimum number of Hadoop Splits to generate. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. 
* If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -995,7 +995,7 @@ class SparkContext(config: SparkConf) extends Logging { /** Get an RDD for a Hadoop file with an arbitrary InputFormat * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1034,7 +1034,7 @@ class SparkContext(config: SparkConf) extends Logging { * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path, minPartitions) * }}} * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1058,7 +1058,7 @@ class SparkContext(config: SparkConf) extends Logging { * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path) * }}} * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1084,7 +1084,7 @@ class SparkContext(config: SparkConf) extends Logging { * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat * and extra configuration options to pass to the input format. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1124,7 +1124,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param kClass Class of the keys * @param vClass Class of the values * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1150,7 +1150,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Get an RDD for a Hadoop SequenceFile with given key and value types. 
* - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1169,7 +1169,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Get an RDD for a Hadoop SequenceFile with given key and value types. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1199,7 +1199,7 @@ class SparkContext(config: SparkConf) extends Logging { * for the appropriate type. In addition, we pass the converter a ClassTag of its type to * allow it to figure out the Writable class to use in the subclass case. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle * operation will create many references to the same object. * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first @@ -1330,16 +1330,18 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Register the given accumulator. Note that accumulators must be registered before use, or it - * will throw exception. + * Register the given accumulator. + * + * @note Accumulators must be registered before use, or it will throw exception. */ def register(acc: AccumulatorV2[_, _]): Unit = { acc.register(this) } /** - * Register the given accumulator with given name. Note that accumulators must be registered - * before use, or it will throw exception. + * Register the given accumulator with given name. + * + * @note Accumulators must be registered before use, or it will throw exception. */ def register(acc: AccumulatorV2[_, _], name: String): Unit = { acc.register(this, name = Some(name)) @@ -1550,7 +1552,7 @@ class SparkContext(config: SparkConf) extends Logging { * :: DeveloperApi :: * Request that the cluster manager kill the specified executors. * - * Note: This is an indication to the cluster manager that the application wishes to adjust + * @note This is an indication to the cluster manager that the application wishes to adjust * its resource usage downwards. If the application wishes to replace the executors it kills * through this method with new ones, it should follow up explicitly with a call to * {{SparkContext#requestExecutors}}. @@ -1572,7 +1574,7 @@ class SparkContext(config: SparkConf) extends Logging { * :: DeveloperApi :: * Request that the cluster manager kill the specified executor. * - * Note: This is an indication to the cluster manager that the application wishes to adjust + * @note This is an indication to the cluster manager that the application wishes to adjust * its resource usage downwards. 
If the application wishes to replace the executor it kills * through this method with a new one, it should follow up explicitly with a call to * {{SparkContext#requestExecutors}}. @@ -1590,7 +1592,7 @@ class SparkContext(config: SparkConf) extends Logging { * this request. This assumes the cluster manager will automatically and eventually * fulfill all missing application resource requests. * - * Note: The replace is by no means guaranteed; another application on the same cluster + * @note The replace is by no means guaranteed; another application on the same cluster * can steal the window of opportunity and acquire this application's resources in the * mean time. * @@ -1639,7 +1641,8 @@ class SparkContext(config: SparkConf) extends Logging { /** * Returns an immutable map of RDDs that have marked themselves as persistent via cache() call. - * Note that this does not necessarily mean the caching or computation was successful. + * + * @note This does not necessarily mean the caching or computation was successful. */ def getPersistentRDDs: Map[Int, RDD[_]] = persistentRdds.toMap @@ -2298,7 +2301,7 @@ object SparkContext extends Logging { * singleton object. Because we can only have one active SparkContext per JVM, * this is useful when applications may wish to share a SparkContext. * - * Note: This function cannot be used to create multiple SparkContext instances + * @note This function cannot be used to create multiple SparkContext instances * even if multiple contexts are allowed. */ def getOrCreate(config: SparkConf): SparkContext = { @@ -2323,7 +2326,7 @@ object SparkContext extends Logging { * * This method allows not passing a SparkConf (useful if just retrieving). * - * Note: This function cannot be used to create multiple SparkContext instances + * @note This function cannot be used to create multiple SparkContext instances * even if multiple contexts are allowed. */ def getOrCreate(): SparkContext = { diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala index 0026fc9dad517..a32a4b28c1731 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaDoubleRDD.scala @@ -153,7 +153,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: JavaDoubleRDD): JavaDoubleRDD = fromRDD(srdd.intersection(other.srdd)) @@ -256,7 +256,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) * e.g 1<=x<10 , 10<=x<20, 20<=x<50 * And on the input of 1 and 50 we would have a histogram of 1,0,0 * - * Note: if your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched + * @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched * from an O(log n) insertion to O(1) per element. (where n = # buckets) if you set evenBuckets * to true. * buckets must be sorted and not contain any duplicates. 
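For reference, the evenly-spaced-bucket behaviour described in the histogram notes above can be exercised as follows. This is a minimal sketch, assuming `sc` is an existing SparkContext:

```scala
// Minimal sketch, assuming `sc` is an existing SparkContext.
val data = sc.parallelize(Seq(1.0, 9.5, 15.0, 25.0, 49.0))

// Five buckets: [0,10), [10,20), [20,30), [30,40), [40,50].
// evenBuckets = true enables the O(1) per-element bucket lookup noted above.
val counts: Array[Long] =
  data.histogram(Array(0.0, 10.0, 20.0, 30.0, 40.0, 50.0), evenBuckets = true)
// counts: Array(2, 1, 1, 0, 1)
```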
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 1c95bc4bfcaaf..bff5a29bb60f1 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -206,7 +206,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: JavaPairRDD[K, V]): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.intersection(other.rdd)) @@ -223,9 +223,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a - * "combined type" C. Note that V and C can be different -- for example, one might group an - * RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three - * functions: + * "combined type" C. + * + * Users provide three functions: * * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list) * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list) @@ -234,6 +234,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * In addition, users can control the partitioning of the output RDD, the serializer that is use * for the shuffle, and whether to perform map-side aggregation (if a mapper can produce multiple * items with the same key). + * + * @note V and C can be different -- for example, one might group an RDD of type (Int, Int) into + * an RDD of type (Int, List[Int]). */ def combineByKey[C](createCombiner: JFunction[V, C], mergeValue: JFunction2[C, V, C], @@ -255,9 +258,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) /** * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns a JavaPairRDD[(K, V)] into a result of type JavaPairRDD[(K, C)], for a - * "combined type" C. Note that V and C can be different -- for example, one might group an - * RDD of type (Int, Int) into an RDD of type (Int, List[Int]). Users provide three - * functions: + * "combined type" C. + * + * Users provide three functions: * * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list) * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list) @@ -265,6 +268,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * * In addition, users can control the partitioning of the output RDD. This method automatically * uses map-side aggregation in shuffling the RDD. + * + * @note V and C can be different -- for example, one might group an RDD of type (Int, Int) into + * an RDD of type (Int, List[Int]). */ def combineByKey[C](createCombiner: JFunction[V, C], mergeValue: JFunction2[C, V, C], @@ -398,7 +404,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Group the values for each key in the RDD into a single sequence. Allows controlling the * partitioning of the resulting key-value pair RDD by passing a Partitioner. 
* - * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over + * @note If you are grouping in order to perform an aggregation (such as a sum or average) over * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] * will provide much better performance. */ @@ -409,7 +415,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Group the values for each key in the RDD into a single sequence. Hash-partitions the * resulting RDD with into `numPartitions` partitions. * - * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over + * @note If you are grouping in order to perform an aggregation (such as a sum or average) over * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] * will provide much better performance. */ @@ -539,7 +545,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Group the values for each key in the RDD into a single sequence. Hash-partitions the * resulting RDD with the existing partitioner/parallelism level. * - * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over + * @note If you are grouping in order to perform an aggregation (such as a sum or average) over * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] * will provide much better performance. */ diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index d67cff64e6e46..ccd94f876e0b8 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -99,27 +99,29 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) /** * Return a sampled subset of this RDD with a random seed. - * Note: this is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. */ def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] = sample(withReplacement, fraction, Utils.random.nextLong) /** * Return a sampled subset of this RDD, with a user-supplied seed. - * Note: this is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] * with replacement: expected number of times each element is chosen; fraction must be >= 0 * @param seed seed for the random number generator + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] = wrapRDD(rdd.sample(withReplacement, fraction, seed)) @@ -157,7 +159,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * Return the intersection of this RDD and another one. 
The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.intersection(other.rdd)) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index a37c52cbaf210..eda16d957cc58 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -47,7 +47,8 @@ private[spark] abstract class AbstractJavaRDDLike[T, This <: JavaRDDLike[T, This /** * Defines operations common to several Java RDD implementations. - * Note that this trait is not intended to be implemented by user code. + * + * @note This trait is not intended to be implemented by user code. */ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { def wrapRDD(rdd: RDD[T]): This diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 4e50c2686dd53..38d347aeab8c6 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -298,7 +298,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop SequenceFile with given key and value types. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -316,7 +316,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop SequenceFile. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -366,7 +366,7 @@ class JavaSparkContext(val sc: SparkContext) * @param valueClass Class of the values * @param minPartitions Minimum number of Hadoop Splits to generate. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -396,7 +396,7 @@ class JavaSparkContext(val sc: SparkContext) * @param keyClass Class of the keys * @param valueClass Class of the values * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. 
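A minimal sketch of the copy-before-caching guidance in the notes above, assuming `sc` is an existing SparkContext and using a placeholder path and Writable types:

```scala
import org.apache.hadoop.io.{IntWritable, Text}

// The RecordReader re-uses the same Writable instances for every record,
// so copy them to plain Scala values with a map before caching.
val raw = sc.sequenceFile("hdfs:///tmp/example-seqfile", classOf[IntWritable], classOf[Text])
val copied = raw.map { case (k, v) => (k.get, v.toString) }
copied.cache()
```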
@@ -416,7 +416,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop file with an arbitrary InputFormat. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -437,7 +437,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get an RDD for a Hadoop file with an arbitrary InputFormat * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -458,7 +458,7 @@ class JavaSparkContext(val sc: SparkContext) * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat * and extra configuration options to pass to the input format. * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -487,7 +487,7 @@ class JavaSparkContext(val sc: SparkContext) * @param kClass Class of the keys * @param vClass Class of the values * - * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each + * @note Because Hadoop's RecordReader class re-uses the same Writable object for each * record, directly caching the returned RDD will create many references to the same object. * If you plan to directly cache Hadoop writable objects, you should first copy them using * a `map` function. @@ -694,7 +694,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Returns the Hadoop configuration used for the Hadoop code (e.g. file systems) we reuse. * - * '''Note:''' As it will be reused in all Hadoop RDDs, it's better not to modify it unless you + * @note As it will be reused in all Hadoop RDDs, it's better not to modify it unless you * plan to set some global configurations for all Hadoop RDDs. */ def hadoopConfiguration(): Configuration = { @@ -811,7 +811,8 @@ class JavaSparkContext(val sc: SparkContext) /** * Returns a Java map of JavaRDDs that have marked themselves as persistent via cache() call. - * Note that this does not necessarily mean the caching or computation was successful. + * + * @note This does not necessarily mean the caching or computation was successful. 
*/ def getPersistentRDDs: JMap[java.lang.Integer, JavaRDD[_]] = { sc.getPersistentRDDs.mapValues(s => JavaRDD.fromRDD(s)) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala index 99ca3c77cced0..6aa290ecd7bb5 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkStatusTracker.scala @@ -31,7 +31,7 @@ import org.apache.spark.{SparkContext, SparkJobInfo, SparkStageInfo} * will provide information for the last `spark.ui.retainedStages` stages and * `spark.ui.retainedJobs` jobs. * - * NOTE: this class's constructor should be considered private and may be subject to change. + * @note This class's constructor should be considered private and may be subject to change. */ class JavaSparkStatusTracker private[spark] (sc: SparkContext) { diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index ae014becef755..6ba79e506a648 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -32,9 +32,8 @@ import org.apache.spark.util.Utils * CompressionCodec allows the customization of choosing different compression implementations * to be used in block storage. * - * Note: The wire protocol for a codec is not guaranteed compatible across versions of Spark. - * This is intended for use as an internal compression utility within a single - * Spark application. + * @note The wire protocol for a codec is not guaranteed compatible across versions of Spark. + * This is intended for use as an internal compression utility within a single Spark application. */ @DeveloperApi trait CompressionCodec { @@ -103,9 +102,9 @@ private[spark] object CompressionCodec { * LZ4 implementation of [[org.apache.spark.io.CompressionCodec]]. * Block size can be configured by `spark.io.compression.lz4.blockSize`. * - * Note: The wire protocol for this codec is not guaranteed to be compatible across versions - * of Spark. This is intended for use as an internal compression utility within a single Spark - * application. + * @note The wire protocol for this codec is not guaranteed to be compatible across versions + * of Spark. This is intended for use as an internal compression utility within a single Spark + * application. */ @DeveloperApi class LZ4CompressionCodec(conf: SparkConf) extends CompressionCodec { @@ -123,9 +122,9 @@ class LZ4CompressionCodec(conf: SparkConf) extends CompressionCodec { * :: DeveloperApi :: * LZF implementation of [[org.apache.spark.io.CompressionCodec]]. * - * Note: The wire protocol for this codec is not guaranteed to be compatible across versions - * of Spark. This is intended for use as an internal compression utility within a single Spark - * application. + * @note The wire protocol for this codec is not guaranteed to be compatible across versions + * of Spark. This is intended for use as an internal compression utility within a single Spark + * application. */ @DeveloperApi class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec { @@ -143,9 +142,9 @@ class LZFCompressionCodec(conf: SparkConf) extends CompressionCodec { * Snappy implementation of [[org.apache.spark.io.CompressionCodec]]. * Block size can be configured by `spark.io.compression.snappy.blockSize`. 
* - * Note: The wire protocol for this codec is not guaranteed to be compatible across versions - * of Spark. This is intended for use as an internal compression utility within a single Spark - * application. + * @note The wire protocol for this codec is not guaranteed to be compatible across versions + * of Spark. This is intended for use as an internal compression utility within a single Spark + * application. */ @DeveloperApi class SnappyCompressionCodec(conf: SparkConf) extends CompressionCodec { diff --git a/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala b/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala index ab6aba6fc7d6a..8f579c5a3033c 100644 --- a/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala +++ b/core/src/main/scala/org/apache/spark/partial/BoundedDouble.scala @@ -28,7 +28,7 @@ class BoundedDouble(val mean: Double, val confidence: Double, val low: Double, v this.mean.hashCode ^ this.confidence.hashCode ^ this.low.hashCode ^ this.high.hashCode /** - * Note that consistent with Double, any NaN value will make equality false + * @note Consistent with Double, any NaN value will make equality false */ override def equals(that: Any): Boolean = that match { diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala index 2381f54ee3f06..a091f06b4ed7c 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala @@ -66,14 +66,14 @@ private[spark] class CoGroupPartition( /** * :: DeveloperApi :: - * A RDD that cogroups its parents. For each key k in parent RDDs, the resulting RDD contains a + * An RDD that cogroups its parents. For each key k in parent RDDs, the resulting RDD contains a * tuple with the list of values for that key. * - * Note: This is an internal API. We recommend users use RDD.cogroup(...) instead of - * instantiating this directly. - * * @param rdds parent RDDs. * @param part partitioner used to partition the shuffle output + * + * @note This is an internal API. We recommend users use RDD.cogroup(...) instead of + * instantiating this directly. */ @DeveloperApi class CoGroupedRDD[K: ClassTag]( diff --git a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala index a05a770b40c57..f3ab324d59119 100644 --- a/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/DoubleRDDFunctions.scala @@ -158,7 +158,7 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { * e.g 1<=x<10 , 10<=x<20, 20<=x<=50 * And on the input of 1 and 50 we would have a histogram of 1, 0, 1 * - * Note: if your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched + * @note If your histogram is evenly spaced (e.g. [0, 10, 20, 30]) this can be switched * from an O(log n) insertion to O(1) per element. (where n = # buckets) if you set evenBuckets * to true. * buckets must be sorted and not contain any duplicates. 
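A minimal sketch of the CoGroupedRDD recommendation above (use `RDD.cogroup` rather than instantiating the RDD directly), assuming `sc` is an existing SparkContext:

```scala
// Minimal sketch, assuming `sc` is an existing SparkContext.
val left  = sc.parallelize(Seq(("a", 1), ("b", 2)))
val right = sc.parallelize(Seq(("a", "x"), ("c", "y")))

// cogroup handles the shuffle and grouping internally; there is no need to
// construct a CoGroupedRDD by hand.
val grouped = left.cogroup(right) // RDD[(String, (Iterable[Int], Iterable[String]))]
```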
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 36a2f5c87e372..86351b8c575e5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -84,9 +84,6 @@ private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: Inp * An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS, * sources in HBase, or S3), using the older MapReduce API (`org.apache.hadoop.mapred`). * - * Note: Instantiating this class directly is not recommended, please use - * [[org.apache.spark.SparkContext.hadoopRDD()]] - * * @param sc The SparkContext to associate the RDD with. * @param broadcastedConf A general Hadoop Configuration, or a subclass of it. If the enclosed * variable references an instance of JobConf, then that JobConf will be used for the Hadoop job. @@ -97,6 +94,9 @@ private[spark] class HadoopPartition(rddId: Int, override val index: Int, s: Inp * @param keyClass Class of the key associated with the inputFormatClass. * @param valueClass Class of the value associated with the inputFormatClass. * @param minPartitions Minimum number of HadoopRDD partitions (Hadoop Splits) to generate. + * + * @note Instantiating this class directly is not recommended, please use + * [[org.apache.spark.SparkContext.hadoopRDD()]] */ @DeveloperApi class HadoopRDD[K, V]( diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala index 488e777fea371..a5965f597038d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala @@ -57,13 +57,13 @@ private[spark] class NewHadoopPartition( * An RDD that provides core functionality for reading data stored in Hadoop (e.g., files in HDFS, * sources in HBase, or S3), using the new MapReduce API (`org.apache.hadoop.mapreduce`). * - * Note: Instantiating this class directly is not recommended, please use - * [[org.apache.spark.SparkContext.newAPIHadoopRDD()]] - * * @param sc The SparkContext to associate the RDD with. * @param inputFormatClass Storage format of the data to be read. * @param keyClass Class of the key associated with the inputFormatClass. * @param valueClass Class of the value associated with the inputFormatClass. + * + * @note Instantiating this class directly is not recommended, please use + * [[org.apache.spark.SparkContext.newAPIHadoopRDD()]] */ @DeveloperApi class NewHadoopRDD[K, V]( diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 67baad1c51bca..9ed0f3d8086a5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -59,8 +59,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * :: Experimental :: * Generic function to combine the elements for each key using a custom set of aggregation * functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C - * Note that V and C can be different -- for example, one might group an RDD of type - * (Int, Int) into an RDD of type (Int, Seq[Int]). 
Users provide three functions: + * + * Users provide three functions: * * - `createCombiner`, which turns a V into a C (e.g., creates a one-element list) * - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list) @@ -68,6 +68,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * * In addition, users can control the partitioning of the output RDD, and whether to perform * map-side aggregation (if a mapper can produce multiple items with the same key). + * + * @note V and C can be different -- for example, one might group an RDD of type + * (Int, Int) into an RDD of type (Int, Seq[Int]). */ @Experimental def combineByKeyWithClassTag[C]( @@ -363,7 +366,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) /** * Count the number of elements for each key, collecting the results to a local Map. * - * Note that this method should only be used if the resulting map is expected to be small, as + * @note This method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. * To handle very large results, consider using rdd.mapValues(_ => 1L).reduceByKey(_ + _), which * returns an RDD[T, Long] instead of a map. @@ -490,11 +493,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * The ordering of elements within each group is not guaranteed, and may even differ * each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an + * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. * - * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for any + * @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]]. */ def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])] = self.withScope { @@ -514,11 +517,11 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * resulting RDD with into `numPartitions` partitions. The ordering of elements within * each group is not guaranteed, and may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an + * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. * - * Note: As currently implemented, groupByKey must be able to hold all the key-value pairs for any + * @note As currently implemented, groupByKey must be able to hold all the key-value pairs for any * key in memory. If a key has too many values, it can result in an [[OutOfMemoryError]]. */ def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])] = self.withScope { @@ -635,7 +638,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * within each group is not guaranteed, and may even differ each time the resulting RDD is * evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an + * @note This operation may be very expensive. 
If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. */ @@ -1016,7 +1019,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * Output the RDD to any Hadoop-supported file system, using a Hadoop `OutputFormat` class * supporting the key and value types K and V in this RDD. * - * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * @note We should make sure our tasks are idempotent when speculation is enabled, i.e. do * not use output committer that writes data directly. * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad * result of using direct output committer with speculation enabled. @@ -1070,7 +1073,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) * output paths required (e.g. a table name to write to) in the same way as it would be * configured for a Hadoop MapReduce job. * - * Note that, we should make sure our tasks are idempotent when speculation is enabled, i.e. do + * @note We should make sure our tasks are idempotent when speculation is enabled, i.e. do * not use output committer that writes data directly. * There is an example in https://issues.apache.org/jira/browse/SPARK-10063 to show the bad * result of using direct output committer with speculation enabled. diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala index 0c6ddda52cee9..ce75a16031a3f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionPruningRDD.scala @@ -48,7 +48,7 @@ private[spark] class PruneDependency[T](rdd: RDD[T], partitionFilterFunc: Int => /** * :: DeveloperApi :: - * A RDD used to prune RDD partitions/partitions so we can avoid launching tasks on + * An RDD used to prune RDD partitions/partitions so we can avoid launching tasks on * all partitions. An example use case: If we know the RDD is partitioned by range, * and the execution DAG has a filter on the key, we can avoid launching tasks * on partitions that don't have the range covering the key. diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala index 3b1acacf409b9..6a89ea8786464 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala @@ -32,7 +32,7 @@ class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) } /** - * A RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD, + * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD, * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain * a random sample of the records in the partition. The random seeds assigned to the samplers * are guaranteed to have different values. 
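A minimal sketch of the groupByKey guidance above, assuming `sc` is an existing SparkContext: for a per-key sum, `reduceByKey` combines values map-side and avoids materializing every value for a key.

```scala
// Minimal sketch, assuming `sc` is an existing SparkContext.
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

// Preferred: reduceByKey combines values map-side before the shuffle.
val sums = pairs.reduceByKey(_ + _) // ("a", 4), ("b", 2)

// Works, but ships every value for each key across the network first.
val sumsViaGroup = pairs.groupByKey().mapValues(_.sum)
```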
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index cded899db1f5c..bff2b8f1d06c9 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -428,7 +428,7 @@ abstract class RDD[T: ClassTag]( * current upstream partitions will be executed in parallel (per whatever * the current partitioning is). * - * Note: With shuffle = true, you can actually coalesce to a larger number + * @note With shuffle = true, you can actually coalesce to a larger number * of partitions. This is useful if you have a small number of partitions, * say 100, potentially with a few partitions being abnormally large. Calling * coalesce(1000, shuffle = true) will result in 1000 partitions with the @@ -466,14 +466,14 @@ abstract class RDD[T: ClassTag]( /** * Return a sampled subset of this RDD. * - * Note: this is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. - * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] * with replacement: expected number of times each element is chosen; fraction must be >= 0 * @param seed seed for the random number generator + * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. */ def sample( withReplacement: Boolean, @@ -537,13 +537,13 @@ abstract class RDD[T: ClassTag]( /** * Return a fixed-size sampled subset of this RDD in an array * - * @note this method should only be used if the resulting array is expected to be small, as - * all the data is loaded into the driver's memory. - * * @param withReplacement whether sampling is done with replacement * @param num size of the returned sample * @param seed seed for the random number generator * @return sample of specified size in an array + * + * @note this method should only be used if the resulting array is expected to be small, as + * all the data is loaded into the driver's memory. */ def takeSample( withReplacement: Boolean, @@ -618,7 +618,7 @@ abstract class RDD[T: ClassTag]( * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. */ def intersection(other: RDD[T]): RDD[T] = withScope { this.map(v => (v, null)).cogroup(other.map(v => (v, null))) @@ -630,7 +630,7 @@ abstract class RDD[T: ClassTag]( * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. * * @param partitioner Partitioner to use for the resulting RDD */ @@ -646,7 +646,7 @@ abstract class RDD[T: ClassTag]( * Return the intersection of this RDD and another one. The output will not contain any duplicate * elements, even if the input RDDs did. Performs a hash partition across the cluster * - * Note that this method performs a shuffle internally. + * @note This method performs a shuffle internally. * * @param numPartitions How many partitions to use in the resulting RDD */ @@ -674,7 +674,7 @@ abstract class RDD[T: ClassTag]( * mapping to that key. 
The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an + * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. */ @@ -687,7 +687,7 @@ abstract class RDD[T: ClassTag]( * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an + * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. */ @@ -702,7 +702,7 @@ abstract class RDD[T: ClassTag]( * mapping to that key. The ordering of elements within each group is not guaranteed, and * may even differ each time the resulting RDD is evaluated. * - * Note: This operation may be very expensive. If you are grouping in order to perform an + * @note This operation may be very expensive. If you are grouping in order to perform an * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. */ @@ -921,7 +921,7 @@ abstract class RDD[T: ClassTag]( /** * Return an array that contains all of the elements in this RDD. * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. */ def collect(): Array[T] = withScope { @@ -934,7 +934,7 @@ abstract class RDD[T: ClassTag]( * * The iterator will consume as much memory as the largest partition in this RDD. * - * Note: this results in multiple Spark jobs, and if the input RDD is the result + * @note This results in multiple Spark jobs, and if the input RDD is the result * of a wide transformation (e.g. join with different partitioners), to avoid * recomputing the input RDD should be cached first. */ @@ -1182,7 +1182,7 @@ abstract class RDD[T: ClassTag]( /** * Return the count of each unique value in this RDD as a local map of (value, count) pairs. * - * Note that this method should only be used if the resulting map is expected to be small, as + * @note This method should only be used if the resulting map is expected to be small, as * the whole thing is loaded into the driver's memory. * To handle very large results, consider using rdd.map(x => (x, 1L)).reduceByKey(_ + _), which * returns an RDD[T, Long] instead of a map. @@ -1272,7 +1272,7 @@ abstract class RDD[T: ClassTag]( * This is similar to Scala's zipWithIndex but it uses Long instead of Int as the index type. * This method needs to trigger a spark job when this RDD contains more than one partitions. * - * Note that some RDDs, such as those returned by groupBy(), do not guarantee order of + * @note Some RDDs, such as those returned by groupBy(), do not guarantee order of * elements in a partition. The index assigned to each element is therefore not guaranteed, * and may even change if the RDD is reevaluated. 
If a fixed ordering is required to guarantee * the same index assignments, you should sort the RDD with sortByKey() or save it to a file. @@ -1286,7 +1286,7 @@ abstract class RDD[T: ClassTag]( * 2*n+k, ..., where n is the number of partitions. So there may exist gaps, but this method * won't trigger a spark job, which is different from [[org.apache.spark.rdd.RDD#zipWithIndex]]. * - * Note that some RDDs, such as those returned by groupBy(), do not guarantee order of + * @note Some RDDs, such as those returned by groupBy(), do not guarantee order of * elements in a partition. The unique ID assigned to each element is therefore not guaranteed, * and may even change if the RDD is reevaluated. If a fixed ordering is required to guarantee * the same index assignments, you should sort the RDD with sortByKey() or save it to a file. @@ -1305,10 +1305,10 @@ abstract class RDD[T: ClassTag]( * results from that partition to estimate the number of additional partitions needed to satisfy * the limit. * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * - * @note due to complications in the internal implementation, this method will raise + * @note Due to complications in the internal implementation, this method will raise * an exception if called on an RDD of `Nothing` or `Null`. */ def take(num: Int): Array[T] = withScope { @@ -1370,7 +1370,7 @@ abstract class RDD[T: ClassTag]( * // returns Array(6, 5) * }}} * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * * @param num k, the number of top elements to return @@ -1393,7 +1393,7 @@ abstract class RDD[T: ClassTag]( * // returns Array(2, 3) * }}} * - * @note this method should only be used if the resulting array is expected to be small, as + * @note This method should only be used if the resulting array is expected to be small, as * all the data is loaded into the driver's memory. * * @param num k, the number of elements to return @@ -1438,7 +1438,7 @@ abstract class RDD[T: ClassTag]( } /** - * @note due to complications in the internal implementation, this method will raise an + * @note Due to complications in the internal implementation, this method will raise an * exception if called on an RDD of `Nothing` or `Null`. This may be come up in practice * because, for example, the type of `parallelize(Seq())` is `RDD[Nothing]`. * (`parallelize(Seq())` should be avoided anyway in favor of `parallelize(Seq[T]())`.) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala index 429514b4f6bee..1070bb96b2524 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDCheckpointData.scala @@ -32,7 +32,7 @@ private[spark] object CheckpointState extends Enumeration { /** * This class contains all the information related to RDD checkpointing. Each instance of this - * class is associated with a RDD. It manages process of checkpointing of the associated RDD, + * class is associated with an RDD. 
It manages process of checkpointing of the associated RDD, * as well as, manages the post-checkpoint state by providing the updated partitions, * iterator and preferred locations of the checkpointed RDD. */ diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index eac901d10067c..7f399ecf81a08 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -151,7 +151,7 @@ private[spark] object ReliableCheckpointRDD extends Logging { } /** - * Write a RDD partition's data to a checkpoint file. + * Write an RDD partition's data to a checkpoint file. */ def writePartitionToCheckpointFile[T: ClassTag]( path: String, diff --git a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala index 1311b481c7c71..86a332790fb00 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SequenceFileRDDFunctions.scala @@ -27,9 +27,10 @@ import org.apache.spark.internal.Logging /** * Extra functions available on RDDs of (key, value) pairs to create a Hadoop SequenceFile, - * through an implicit conversion. Note that this can't be part of PairRDDFunctions because - * we need more implicit parameters to convert our keys and values to Writable. + * through an implicit conversion. * + * @note This can't be part of PairRDDFunctions because we need more implicit parameters to + * convert our keys and values to Writable. */ class SequenceFileRDDFunctions[K <% Writable: ClassTag, V <% Writable : ClassTag]( self: RDD[(K, V)], diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index b0e5ba0865c63..8425b211d6ecf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -29,7 +29,7 @@ class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) } /** - * Represents a RDD zipped with its element indices. The ordering is first based on the partition + * Represents an RDD zipped with its element indices. The ordering is first based on the partition * index and then the ordering of items within each partition. So the first item in the first * partition gets index 0, and the last item in the last partition receives the largest index. * diff --git a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala index cedacad44afec..0a5fe5a1d3ee1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala @@ -24,11 +24,6 @@ import org.apache.spark.annotation.DeveloperApi * :: DeveloperApi :: * Information about an [[org.apache.spark.Accumulable]] modified during a task or stage. * - * Note: once this is JSON serialized the types of `update` and `value` will be lost and be - * cast to strings. This is because the user can define an accumulator of any type and it will - * be difficult to preserve the type in consumers of the event log. This does not apply to - * internal accumulators that represent task level metrics. 
- * * @param id accumulator ID * @param name accumulator name * @param update partial value from a task, may be None if used on driver to describe a stage @@ -36,6 +31,11 @@ import org.apache.spark.annotation.DeveloperApi * @param internal whether this accumulator was internal * @param countFailedValues whether to count this accumulator's partial value if the task failed * @param metadata internal metadata associated with this accumulator, if any + * + * @note Once this is JSON serialized the types of `update` and `value` will be lost and be + * cast to strings. This is because the user can define an accumulator of any type and it will + * be difficult to preserve the type in consumers of the event log. This does not apply to + * internal accumulators that represent task level metrics. */ @DeveloperApi case class AccumulableInfo private[spark] ( diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index 8b72da2ee01b7..f60dcfddfdc20 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -131,7 +131,7 @@ private[spark] class JavaSerializerInstance( * :: DeveloperApi :: * A Spark serializer that uses Java's built-in serialization. * - * Note that this serializer is not guaranteed to be wire-compatible across different versions of + * @note This serializer is not guaranteed to be wire-compatible across different versions of * Spark. It is intended to be used to serialize/de-serialize data within a single * Spark application. */ diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 0d26281fe1076..19e020c968a9a 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -45,7 +45,7 @@ import org.apache.spark.util.collection.CompactBuffer /** * A Spark serializer that uses the [[https://code.google.com/p/kryo/ Kryo serialization library]]. * - * Note that this serializer is not guaranteed to be wire-compatible across different versions of + * @note This serializer is not guaranteed to be wire-compatible across different versions of * Spark. It is intended to be used to serialize/de-serialize data within a single * Spark application. */ diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala index cb95246d5b0ca..afe6cd86059f0 100644 --- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala @@ -40,7 +40,7 @@ import org.apache.spark.util.NextIterator * * 2. Java serialization interface. * - * Note that serializers are not required to be wire-compatible across different versions of Spark. + * @note Serializers are not required to be wire-compatible across different versions of Spark. * They are intended to be used to serialize/de-serialize data within a single Spark application. 
*/ @DeveloperApi diff --git a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala index fb9941bbd9e0f..e12f2e6095d5a 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageUtils.scala @@ -71,7 +71,7 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the blocks stored in this block manager. * - * Note that this is somewhat expensive, as it involves cloning the underlying maps and then + * @note This is somewhat expensive, as it involves cloning the underlying maps and then * concatenating them together. Much faster alternatives exist for common operations such as * contains, get, and size. */ @@ -80,7 +80,7 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the RDD blocks stored in this block manager. * - * Note that this is somewhat expensive, as it involves cloning the underlying maps and then + * @note This is somewhat expensive, as it involves cloning the underlying maps and then * concatenating them together. Much faster alternatives exist for common operations such as * getting the memory, disk, and off-heap memory sizes occupied by this RDD. */ @@ -128,7 +128,8 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return whether the given block is stored in this block manager in O(1) time. - * Note that this is much faster than `this.blocks.contains`, which is O(blocks) time. + * + * @note This is much faster than `this.blocks.contains`, which is O(blocks) time. */ def containsBlock(blockId: BlockId): Boolean = { blockId match { @@ -141,7 +142,8 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the given block stored in this block manager in O(1) time. - * Note that this is much faster than `this.blocks.get`, which is O(blocks) time. + * + * @note This is much faster than `this.blocks.get`, which is O(blocks) time. */ def getBlock(blockId: BlockId): Option[BlockStatus] = { blockId match { @@ -154,19 +156,22 @@ class StorageStatus(val blockManagerId: BlockManagerId, val maxMem: Long) { /** * Return the number of blocks stored in this block manager in O(RDDs) time. - * Note that this is much faster than `this.blocks.size`, which is O(blocks) time. + * + * @note This is much faster than `this.blocks.size`, which is O(blocks) time. */ def numBlocks: Int = _nonRddBlocks.size + numRddBlocks /** * Return the number of RDD blocks stored in this block manager in O(RDDs) time. - * Note that this is much faster than `this.rddBlocks.size`, which is O(RDD blocks) time. + * + * @note This is much faster than `this.rddBlocks.size`, which is O(RDD blocks) time. */ def numRddBlocks: Int = _rddBlocks.values.map(_.size).sum /** * Return the number of blocks that belong to the given RDD in O(1) time. - * Note that this is much faster than `this.rddBlocksById(rddId).size`, which is + * + * @note This is much faster than `this.rddBlocksById(rddId).size`, which is * O(blocks in this RDD) time. 
*/ def numRddBlocksById(rddId: Int): Int = _rddBlocks.get(rddId).map(_.size).getOrElse(0) diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala index d3ddd39131326..1326f0977c241 100644 --- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala +++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala @@ -59,8 +59,9 @@ abstract class AccumulatorV2[IN, OUT] extends Serializable { } /** - * Returns true if this accumulator has been registered. Note that all accumulators must be - * registered before use, or it will throw exception. + * Returns true if this accumulator has been registered. + * + * @note All accumulators must be registered before use, or it will throw exception. */ final def isRegistered: Boolean = metadata != null && AccumulatorContext.get(metadata.id).isDefined diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index bec95d13d193a..5e8a854e46a0f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2076,7 +2076,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou } /** - * Checks the DAGScheduler's internal logic for traversing a RDD DAG by making sure that + * Checks the DAGScheduler's internal logic for traversing an RDD DAG by making sure that * getShuffleDependencies correctly returns the direct shuffle dependencies of a particular * RDD. The test creates the following RDD graph (where n denotes a narrow dependency and s * denotes a shuffle dependency): diff --git a/docs/mllib-isotonic-regression.md b/docs/mllib-isotonic-regression.md index d90905a86ade9..ca84551506b2b 100644 --- a/docs/mllib-isotonic-regression.md +++ b/docs/mllib-isotonic-regression.md @@ -27,7 +27,7 @@ best fitting the original data points. [pool adjacent violators algorithm](http://doi.org/10.1198/TECH.2010.10111) which uses an approach to [parallelizing isotonic regression](http://doi.org/10.1007/978-3-642-99789-1_10). -The training input is a RDD of tuples of three double values that represent +The training input is an RDD of tuples of three double values that represent label, feature and weight in this order. Additionally IsotonicRegression algorithm has one optional parameter called $isotonic$ defaulting to true. This argument specifies if the isotonic regression is diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 0b0315b366501..18fc1cd934826 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -2191,7 +2191,7 @@ consistent batch processing times. Make sure you set the CMS GC on both the driv - When data is received from a stream source, receiver creates blocks of data. A new block of data is generated every blockInterval milliseconds. N blocks of data are created during the batchInterval where N = batchInterval/blockInterval. These blocks are distributed by the BlockManager of the current executor to the block managers of other executors. After that, the Network Input Tracker running on the driver is informed about the block locations for further processing. -- A RDD is created on the driver for the blocks created during the batchInterval. The blocks generated during the batchInterval are partitions of the RDD. Each partition is a task in spark. 
blockInterval== batchinterval would mean that a single partition is created and probably it is processed locally. +- An RDD is created on the driver for the blocks created during the batchInterval. The blocks generated during the batchInterval are partitions of the RDD. Each partition is a task in spark. blockInterval== batchinterval would mean that a single partition is created and probably it is processed locally. - The map tasks on the blocks are processed in the executors (one that received the block, and another where the block was replicated) that has the blocks irrespective of block interval, unless non-local scheduling kicks in. Having bigger blockinterval means bigger blocks. A high value of `spark.locality.wait` increases the chance of processing a block on the local node. A balance needs to be found out between these two parameters to ensure that the bigger blocks are processed locally. diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 5bcc5124b0915..341081a338c0e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -279,7 +279,7 @@ private[kafka010] case class KafkaSource( } }.toArray - // Create a RDD that reads from Kafka and get the (key, value) pair as byte arrays. + // Create an RDD that reads from Kafka and get the (key, value) pair as byte arrays. val rdd = new KafkaSourceRDD( sc, executorKafkaParams, offsetRanges, pollTimeoutMs).map { cr => Row(cr.key, cr.value, cr.topic, cr.partition, cr.offset, cr.timestamp, cr.timestampType.id) diff --git a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index b17e198077949..56f0cb0b166a2 100644 --- a/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ b/external/kafka-0-8/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -223,7 +223,7 @@ object KafkaUtils { } /** - * Create a RDD from Kafka using offset ranges for each topic and partition. + * Create an RDD from Kafka using offset ranges for each topic and partition. * * @param sc SparkContext object * @param kafkaParams Kafka @@ -255,7 +255,7 @@ object KafkaUtils { } /** - * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you + * Create an RDD from Kafka using offset ranges for each topic and partition. This allows you * specify the Kafka leader to connect to (to optimize fetching) and access the message as well * as the metadata. * @@ -303,7 +303,7 @@ object KafkaUtils { } /** - * Create a RDD from Kafka using offset ranges for each topic and partition. + * Create an RDD from Kafka using offset ranges for each topic and partition. * * @param jsc JavaSparkContext object * @param kafkaParams Kafka @@ -340,7 +340,7 @@ object KafkaUtils { } /** - * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you + * Create an RDD from Kafka using offset ranges for each topic and partition. This allows you * specify the Kafka leader to connect to (to optimize fetching) and access the message as well * as the metadata. 
* diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index a0007d33d6257..b2daffa34ccbf 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -33,10 +33,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * * @param ssc StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -57,6 +53,10 @@ object KinesisUtils { * StorageLevel.MEMORY_AND_DISK_2 is recommended. * @param messageHandler A custom message handler that can generate a generic output from a * Kinesis `Record`, which contains both message data, and metadata. + * + * @note The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain + * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain + * gets the AWS credentials. */ def createStream[T: ClassTag]( ssc: StreamingContext, @@ -81,10 +81,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * * @param ssc StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -107,6 +103,9 @@ object KinesisUtils { * Kinesis `Record`, which contains both message data, and metadata. * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) + * + * @note The given AWS credentials will get saved in DStream checkpoints if checkpointing + * is enabled. Make sure that your checkpoint directory is secure. */ // scalastyle:off def createStream[T: ClassTag]( @@ -134,10 +133,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * * @param ssc StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -156,6 +151,10 @@ object KinesisUtils { * details on the different types of checkpoints. * @param storageLevel Storage level to use for storing the received objects. * StorageLevel.MEMORY_AND_DISK_2 is recommended. + * + * @note The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain + * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain + * gets the AWS credentials. 
*/ def createStream( ssc: StreamingContext, @@ -178,10 +177,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * * @param ssc StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -202,6 +197,9 @@ object KinesisUtils { * StorageLevel.MEMORY_AND_DISK_2 is recommended. * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) + * + * @note The given AWS credentials will get saved in DStream checkpoints if checkpointing + * is enabled. Make sure that your checkpoint directory is secure. */ def createStream( ssc: StreamingContext, @@ -225,10 +223,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. - * * @param jssc Java StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -250,6 +244,10 @@ object KinesisUtils { * @param messageHandler A custom message handler that can generate a generic output from a * Kinesis `Record`, which contains both message data, and metadata. * @param recordClass Class of the records in DStream + * + * @note The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain + * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain + * gets the AWS credentials. */ def createStream[T]( jssc: JavaStreamingContext, @@ -272,10 +270,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * * @param jssc Java StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -299,6 +293,9 @@ object KinesisUtils { * @param recordClass Class of the records in DStream * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) + * + * @note The given AWS credentials will get saved in DStream checkpoints if checkpointing + * is enabled. Make sure that your checkpoint directory is secure. */ // scalastyle:off def createStream[T]( @@ -326,10 +323,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain - * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain - * gets the AWS credentials. 
- * * @param jssc Java StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -348,6 +341,10 @@ object KinesisUtils { * details on the different types of checkpoints. * @param storageLevel Storage level to use for storing the received objects. * StorageLevel.MEMORY_AND_DISK_2 is recommended. + * + * @note The AWS credentials will be discovered using the DefaultAWSCredentialsProviderChain + * on the workers. See AWS documentation to understand how DefaultAWSCredentialsProviderChain + * gets the AWS credentials. */ def createStream( jssc: JavaStreamingContext, @@ -367,10 +364,6 @@ object KinesisUtils { * Create an input stream that pulls messages from a Kinesis stream. * This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. * - * Note: - * The given AWS credentials will get saved in DStream checkpoints if checkpointing - * is enabled. Make sure that your checkpoint directory is secure. - * * @param jssc Java StreamingContext object * @param kinesisAppName Kinesis application name used by the Kinesis Client Library * (KCL) to update DynamoDB @@ -391,6 +384,9 @@ object KinesisUtils { * StorageLevel.MEMORY_AND_DISK_2 is recommended. * @param awsAccessKeyId AWS AccessKeyId (if null, will use DefaultAWSCredentialsProviderChain) * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) + * + * @note The given AWS credentials will get saved in DStream checkpoints if checkpointing + * is enabled. Make sure that your checkpoint directory is secure. */ def createStream( jssc: JavaStreamingContext, diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala index 905c33834df16..a4d81a680979e 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala @@ -221,7 +221,7 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) assert(collectedData.toSet === testData.toSet) // Verify that the block fetching is skipped when isBlockValid is set to false. - // This is done by using a RDD whose data is only in memory but is set to skip block fetching + // This is done by using an RDD whose data is only in memory but is set to skip block fetching // Using that RDD will throw exception, as it skips block fetching even if the blocks are in // in BlockManager. if (testIsBlockValid) { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala index e18831382d4d5..3810110099993 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala @@ -42,7 +42,7 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( @transient override val edges: EdgeRDDImpl[ED, VD] = replicatedVertexView.edges - /** Return a RDD that brings edges together with their source and destination vertices. */ + /** Return an RDD that brings edges together with their source and destination vertices. 
*/ @transient override lazy val triplets: RDD[EdgeTriplet[VD, ED]] = { replicatedVertexView.upgrade(vertices, true, true) replicatedVertexView.edges.partitionsRDD.mapPartitions(_.flatMap { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index c0c3c73463aab..f926984aa6335 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -58,7 +58,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} * `alpha` is the random reset probability (typically 0.15), `inNbrs[i]` is the set of * neighbors which link to `i` and `outDeg[j]` is the out degree of vertex `j`. * - * Note that this is not the "normalized" PageRank and as a consequence pages that have no + * @note This is not the "normalized" PageRank and as a consequence pages that have no * inlinks will have a PageRank of alpha. */ object PageRank extends Logging { diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala index 2e4a58dc6291c..22e4ec693b1f7 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala @@ -30,7 +30,7 @@ import org.apache.spark.annotation.Since /** * Represents a numeric vector, whose index type is Int and value type is Double. * - * Note: Users should not implement this interface. + * @note Users should not implement this interface. */ @Since("2.0.0") sealed trait Vector extends Serializable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala index 252acc156583f..c581fed177273 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala @@ -30,7 +30,7 @@ import org.apache.spark.ml.param.ParamMap abstract class Model[M <: Model[M]] extends Transformer { /** * The parent estimator that produced this model. - * Note: For ensembles' component Models, this value can be null. + * @note For ensembles' component Models, this value can be null. */ @transient var parent: Estimator[M] = _ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index bb192ab5f25ab..7424031ed4608 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -207,9 +207,9 @@ class DecisionTreeClassificationModel private[ml] ( * where gain is scaled by the number of instances passing through node * - Normalize importances for tree to sum to 1. * - * Note: Feature importance for single decision trees can have high variance due to - * correlated predictor variables. Consider using a [[RandomForestClassifier]] - * to determine feature importance instead. + * @note Feature importance for single decision trees can have high variance due to + * correlated predictor variables. Consider using a [[RandomForestClassifier]] + * to determine feature importance instead. 
*/ @Since("2.0.0") lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(this, numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index f8f164e8c14bd..52f93f5a6b345 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -43,7 +43,6 @@ import org.apache.spark.sql.types.DoubleType * Gradient-Boosted Trees (GBTs) (http://en.wikipedia.org/wiki/Gradient_boosting) * learning algorithm for classification. * It supports binary labels, as well as both continuous and categorical features. - * Note: Multiclass labels are not currently supported. * * The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999. * @@ -54,6 +53,8 @@ import org.apache.spark.sql.types.DoubleType * based on the loss function, whereas the original gradient boosting method does not. * - We expect to implement TreeBoost in the future: * [https://issues.apache.org/jira/browse/SPARK-4240] + * + * @note Multiclass labels are not currently supported. */ @Since("1.4.0") class GBTClassifier @Since("1.4.0") ( @@ -169,10 +170,11 @@ object GBTClassifier extends DefaultParamsReadable[GBTClassifier] { * Gradient-Boosted Trees (GBTs) (http://en.wikipedia.org/wiki/Gradient_boosting) * model for classification. * It supports binary labels, as well as both continuous and categorical features. - * Note: Multiclass labels are not currently supported. * * @param _trees Decision trees in the ensemble. * @param _treeWeights Weights for the decision trees in the ensemble. + * + * @note Multiclass labels are not currently supported. */ @Since("1.6.0") class GBTClassificationModel private[ml]( diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 18b9b3043db8a..71a7fe53c15f8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -1191,8 +1191,8 @@ class BinaryLogisticRegressionSummary private[classification] ( * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. * See http://en.wikipedia.org/wiki/Receiver_operating_characteristic * - * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. + * This will change in later Spark versions. */ @Since("1.5.0") @transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR") @@ -1200,8 +1200,8 @@ class BinaryLogisticRegressionSummary private[classification] ( /** * Computes the area under the receiver operating characteristic (ROC) curve. * - * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. + * This will change in later Spark versions. 
*/ @Since("1.5.0") lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC() @@ -1210,8 +1210,8 @@ class BinaryLogisticRegressionSummary private[classification] ( * Returns the precision-recall curve, which is a Dataframe containing * two fields recall, precision with (0.0, 1.0) prepended to it. * - * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. + * This will change in later Spark versions. */ @Since("1.5.0") @transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision") @@ -1219,8 +1219,8 @@ class BinaryLogisticRegressionSummary private[classification] ( /** * Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. * - * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. + * This will change in later Spark versions. */ @Since("1.5.0") @transient lazy val fMeasureByThreshold: DataFrame = { @@ -1232,8 +1232,8 @@ class BinaryLogisticRegressionSummary private[classification] ( * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the precision. * - * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. + * This will change in later Spark versions. */ @Since("1.5.0") @transient lazy val precisionByThreshold: DataFrame = { @@ -1245,8 +1245,8 @@ class BinaryLogisticRegressionSummary private[classification] ( * Every possible probability obtained in transforming the dataset are used * as thresholds used in calculating the recall. * - * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`. + * This will change in later Spark versions. */ @Since("1.5.0") @transient lazy val recallByThreshold: DataFrame = { @@ -1401,18 +1401,18 @@ class BinaryLogisticRegressionSummary private[classification] ( * $$ *

* - * @note In order to avoid unnecessary computation during calculation of the gradient updates - * we lay out the coefficients in column major order during training. This allows us to - * perform feature standardization once, while still retaining sequential memory access - * for speed. We convert back to row major order when we create the model, - * since this form is optimal for the matrix operations used for prediction. - * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. * @param numClasses the number of possible outcomes for k classes classification problem in * Multinomial Logistic Regression. * @param fitIntercept Whether to fit an intercept term. * @param multinomial Whether to use multinomial (softmax) or binary loss + * + * @note In order to avoid unnecessary computation during calculation of the gradient updates + * we lay out the coefficients in column major order during training. This allows us to + * perform feature standardization once, while still retaining sequential memory access + * for speed. We convert back to row major order when we create the model, + * since this form is optimal for the matrix operations used for prediction. */ private class LogisticAggregator( bcCoefficients: Broadcast[Vector], diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index a0bd66e731a1d..c6035cc4c9647 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -268,9 +268,9 @@ object GaussianMixtureModel extends MLReadable[GaussianMixtureModel] { * While this process is generally guaranteed to converge, it is not guaranteed * to find a global optimum. * - * Note: For high-dimensional data (with many features), this algorithm may perform poorly. - * This is due to high-dimensional data (a) making it difficult to cluster at all (based - * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. + * @note For high-dimensional data (with many features), this algorithm may perform poorly. + * This is due to high-dimensional data (a) making it difficult to cluster at all (based + * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. */ @Since("2.0.0") @Experimental diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index 28cbe1cb01e9a..ccfb0ce8f85ca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -85,7 +85,8 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H *

* * For the case $E_{max} == E_{min}$, $Rescaled(e_i) = 0.5 * (max + min)$. - * Note that since zero values will probably be transformed to non-zero values, output of the + * + * @note Since zero values will probably be transformed to non-zero values, output of the * transformer will be DenseVector even for sparse input. */ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index e8e28ba29c841..ea401216aec7b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -36,7 +36,8 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} * The last category is not included by default (configurable via [[OneHotEncoder!.dropLast]] * because it makes the vector entries sum up to one, and hence linearly dependent. * So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. - * Note that this is different from scikit-learn's OneHotEncoder, which keeps all categories. + * + * @note This is different from scikit-learn's OneHotEncoder, which keeps all categories. * The output vectors are sparse. * * @see [[StringIndexer]] for converting categorical values into category indices diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 1e49352b8517e..6e08bf059124c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -142,8 +142,9 @@ class PCAModel private[ml] ( /** * Transform a vector by computed Principal Components. - * NOTE: Vectors to be transformed must be the same length - * as the source vectors given to [[PCA.fit()]]. + * + * @note Vectors to be transformed must be the same length as the source vectors given + * to [[PCA.fit()]]. */ @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 666070037cdd8..0ced21365ff6f 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -28,7 +28,10 @@ import org.apache.spark.sql.types.{ArrayType, StringType, StructType} /** * A feature transformer that filters out stop words from input. - * Note: null values from input array are preserved unless adding null to stopWords explicitly. + * + * @note null values from input array are preserved unless adding null to stopWords + * explicitly. + * * @see [[http://en.wikipedia.org/wiki/Stop_words]] */ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 80fe46796f807..8b155f00017cf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -113,11 +113,11 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] { /** * Model fitted by [[StringIndexer]]. * - * NOTE: During transformation, if the input column does not exist, + * @param labels Ordered list of labels, corresponding to indices to be assigned. 
+ * + * @note During transformation, if the input column does not exist, * [[StringIndexerModel.transform]] would return the input dataset unmodified. * This is a temporary fix for the case when target labels do not exist during prediction. - * - * @param labels Ordered list of labels, corresponding to indices to be assigned. */ @Since("1.4.0") class StringIndexerModel ( diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 9245931b27ca6..96206e0b7ad88 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -533,7 +533,7 @@ trait Params extends Identifiable with Serializable { * Returns all params sorted by their names. The default implementation uses Java reflection to * list all public methods that have no arguments and return [[Param]]. * - * Note: Developer should not use this method in constructor because we cannot guarantee that + * @note Developer should not use this method in constructor because we cannot guarantee that * this variable gets initialized before other params. */ lazy val params: Array[Param[_]] = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index ebc6c12ddcf92..1419da874709f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -207,9 +207,9 @@ class DecisionTreeRegressionModel private[ml] ( * where gain is scaled by the number of instances passing through node * - Normalize importances for tree to sum to 1. * - * Note: Feature importance for single decision trees can have high variance due to - * correlated predictor variables. Consider using a [[RandomForestRegressor]] - * to determine feature importance instead. + * @note Feature importance for single decision trees can have high variance due to + * correlated predictor variables. Consider using a [[RandomForestRegressor]] + * to determine feature importance instead. */ @Since("2.0.0") lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(this, numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 1d2961e0277f5..736fd3b9e0f64 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -879,8 +879,8 @@ class GeneralizedLinearRegressionSummary private[regression] ( * Private copy of model to ensure Params are not modified outside this class. * Coefficients is not a deep copy, but that is acceptable. * - * NOTE: [[predictionCol]] must be set correctly before the value of [[model]] is set, - * and [[model]] must be set before [[predictions]] is set! + * @note [[predictionCol]] must be set correctly before the value of [[model]] is set, + * and [[model]] must be set before [[predictions]] is set! 
*/ protected val model: GeneralizedLinearRegressionModel = origModel.copy(ParamMap.empty).setPredictionCol(predictionCol) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 71c542adf6f6f..da7ce6b46f2ab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -103,11 +103,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String /** * Whether to standardize the training features before fitting the model. * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. Note that with/without standardization, - * the models should be always converged to the same solution when no regularization - * is applied. In R's GLMNET package, the default behavior is true as well. + * so it will be transparent for users. * Default is true. * + * @note With/without standardization, the models should be always converged + * to the same solution when no regularization is applied. In R's GLMNET package, + * the default behavior is true as well. + * * @group setParam */ @Since("1.5.0") @@ -624,8 +626,8 @@ class LinearRegressionSummary private[regression] ( * explainedVariance = 1 - variance(y - \hat{y}) / variance(y) * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]] * - * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * This will change in later Spark versions. */ @Since("1.5.0") val explainedVariance: Double = metrics.explainedVariance @@ -634,8 +636,8 @@ class LinearRegressionSummary private[regression] ( * Returns the mean absolute error, which is a risk function corresponding to the * expected value of the absolute error loss or l1-norm loss. * - * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * This will change in later Spark versions. */ @Since("1.5.0") val meanAbsoluteError: Double = metrics.meanAbsoluteError @@ -644,8 +646,8 @@ class LinearRegressionSummary private[regression] ( * Returns the mean squared error, which is a risk function corresponding to the * expected value of the squared error loss or quadratic loss. * - * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * This will change in later Spark versions. */ @Since("1.5.0") val meanSquaredError: Double = metrics.meanSquaredError @@ -654,8 +656,8 @@ class LinearRegressionSummary private[regression] ( * Returns the root mean squared error, which is defined as the square root of * the mean squared error. * - * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * This will change in later Spark versions. 
*/ @Since("1.5.0") val rootMeanSquaredError: Double = metrics.rootMeanSquaredError @@ -664,8 +666,8 @@ class LinearRegressionSummary private[regression] ( * Returns R^2^, the coefficient of determination. * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]] * - * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. - * This will change in later Spark versions. + * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]]. + * This will change in later Spark versions. */ @Since("1.5.0") val r2: Double = metrics.r2 diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala index 73d813064decb..e1376927030e4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala @@ -48,7 +48,7 @@ import org.apache.spark.sql.{DataFrame, DataFrameReader} * inconsistent feature dimensions. * - "vectorType": feature vector type, "sparse" (default) or "dense". * - * Note that this class is public for documentation purpose. Please don't use this class directly. + * @note This class is public for documentation purpose. Please don't use this class directly. * Rather, use the data source API as illustrated above. * * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]] diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala index ede0a060eef95..0a0bc4c006389 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala @@ -98,7 +98,7 @@ private[spark] object GradientBoostedTrees extends Logging { * @param initTreeWeight: learning rate assigned to the first tree. * @param initTree: first DecisionTreeModel. * @param loss: evaluation metric. - * @return a RDD with each element being a zip of the prediction and error + * @return an RDD with each element being a zip of the prediction and error * corresponding to every sample. */ def computeInitialPredictionAndError( @@ -121,7 +121,7 @@ private[spark] object GradientBoostedTrees extends Logging { * @param treeWeight: Learning rate. * @param tree: Tree using which the prediction and error should be updated. * @param loss: evaluation metric. - * @return a RDD with each element being a zip of the prediction and error + * @return an RDD with each element being a zip of the prediction and error * corresponding to each sample. */ def updatePredictionError( diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index bc4f9e6716ee8..e5fa5d53e3fca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -221,7 +221,7 @@ trait MLReadable[T] { /** * Reads an ML instance from the input path, a shortcut of `read.load(path)`. * - * Note: Implementing classes should override this to be Java-friendly. + * @note Implementing classes should override this to be Java-friendly. 
*/ @Since("1.6.0") def load(path: String): T = read.load(path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index d851b983349c9..4b650000736e2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -202,9 +202,11 @@ object LogisticRegressionModel extends Loader[LogisticRegressionModel] { * Train a classification model for Binary Logistic Regression * using Stochastic Gradient Descent. By default L2 regularization is used, * which can be changed via `LogisticRegressionWithSGD.optimizer`. - * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1} - * for k classes multi-label classification problem. + * * Using [[LogisticRegressionWithLBFGS]] is recommended over this. + * + * @note Labels used in Logistic Regression should be {0, 1, ..., k - 1} + * for k classes multi-label classification problem. */ @Since("0.8.0") class LogisticRegressionWithSGD private[mllib] ( @@ -239,7 +241,8 @@ class LogisticRegressionWithSGD private[mllib] ( /** * Top-level methods for calling Logistic Regression using Stochastic Gradient Descent. - * NOTE: Labels used in Logistic Regression should be {0, 1} + * + * @note Labels used in Logistic Regression should be {0, 1} */ @Since("0.8.0") @deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0") @@ -252,7 +255,6 @@ object LogisticRegressionWithSGD { * number of iterations of gradient descent using the specified step size. Each iteration uses * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in * gradient descent are initialized using the initial weights provided. - * NOTE: Labels used in Logistic Regression should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. @@ -260,6 +262,8 @@ object LogisticRegressionWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. + * + * @note Labels used in Logistic Regression should be {0, 1} */ @Since("1.0.0") def train( @@ -276,13 +280,13 @@ object LogisticRegressionWithSGD { * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed * number of iterations of gradient descent using the specified step size. Each iteration uses * `miniBatchFraction` fraction of the data to calculate the gradient. - * NOTE: Labels used in Logistic Regression should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @param stepSize Step size to be used for each iteration of gradient descent. - * @param miniBatchFraction Fraction of data to be used per iteration. + * + * @note Labels used in Logistic Regression should be {0, 1} */ @Since("1.0.0") def train( @@ -298,13 +302,13 @@ object LogisticRegressionWithSGD { * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed * number of iterations of gradient descent using the specified step size. We use the entire data * set to update the gradient in each iteration. 
- * NOTE: Labels used in Logistic Regression should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param stepSize Step size to be used for each iteration of Gradient Descent. - * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. + * + * @note Labels used in Logistic Regression should be {0, 1} */ @Since("1.0.0") def train( @@ -318,11 +322,12 @@ object LogisticRegressionWithSGD { * Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed * number of iterations of gradient descent using a step size of 1.0. We use the entire data set * to update the gradient in each iteration. - * NOTE: Labels used in Logistic Regression should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a LogisticRegressionModel which has the weights and offset from training. + * + * @note Labels used in Logistic Regression should be {0, 1} */ @Since("1.0.0") def train( @@ -335,8 +340,6 @@ object LogisticRegressionWithSGD { /** * Train a classification model for Multinomial/Binary Logistic Regression using * Limited-memory BFGS. Standard feature scaling and L2 regularization are used by default. - * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1} - * for k classes multi-label classification problem. * * Earlier implementations of LogisticRegressionWithLBFGS applies a regularization * penalty to all elements including the intercept. If this is called with one of @@ -344,6 +347,9 @@ object LogisticRegressionWithSGD { * into a call to ml.LogisticRegression, otherwise this will use the existing mllib * GeneralizedLinearAlgorithm trainer, resulting in a regularization penalty to the * intercept. + * + * @note Labels used in Logistic Regression should be {0, 1, ..., k - 1} + * for k classes multi-label classification problem. */ @Since("1.1.0") class LogisticRegressionWithLBFGS diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index 7c3ccbb40b812..aec1526b55c49 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -125,7 +125,8 @@ object SVMModel extends Loader[SVMModel] { /** * Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. By default L2 * regularization is used, which can be changed via [[SVMWithSGD.optimizer]]. - * NOTE: Labels used in SVM should be {0, 1}. + * + * @note Labels used in SVM should be {0, 1}. */ @Since("0.8.0") class SVMWithSGD private ( @@ -158,7 +159,9 @@ class SVMWithSGD private ( } /** - * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}. + * Top-level methods for calling SVM. + * + * @note Labels used in SVM should be {0, 1}. */ @Since("0.8.0") object SVMWithSGD { @@ -169,8 +172,6 @@ object SVMWithSGD { * `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in * gradient descent are initialized using the initial weights provided. * - * NOTE: Labels used in SVM should be {0, 1}. - * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @param stepSize Step size to be used for each iteration of gradient descent. 
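A minimal sketch of the {0, 1} label convention referenced in the surrounding notes, assuming a SparkContext named `sc` is already in scope (illustrative only):

    import org.apache.spark.mllib.classification.SVMWithSGD
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint

    // Binary labels must be encoded as 0.0 / 1.0 (not -1.0 / +1.0).
    val training = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.5)),
      LabeledPoint(0.0, Vectors.dense(-1.0, -0.5))))
    val model = SVMWithSGD.train(training, 20)  // 20 iterations of SGD
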
@@ -178,6 +179,8 @@ object SVMWithSGD { * @param miniBatchFraction Fraction of data to be used per iteration. * @param initialWeights Initial set of weights to be used. Array should be equal in size to * the number of features in the data. + * + * @note Labels used in SVM should be {0, 1}. */ @Since("0.8.0") def train( @@ -195,7 +198,8 @@ object SVMWithSGD { * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number * of iterations of gradient descent using the specified step size. Each iteration uses * `miniBatchFraction` fraction of the data to calculate the gradient. - * NOTE: Labels used in SVM should be {0, 1} + * + * @note Labels used in SVM should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. @@ -217,13 +221,14 @@ object SVMWithSGD { * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number * of iterations of gradient descent using the specified step size. We use the entire data set to * update the gradient in each iteration. - * NOTE: Labels used in SVM should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param stepSize Step size to be used for each iteration of Gradient Descent. * @param regParam Regularization parameter. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. + * + * @note Labels used in SVM should be {0, 1} */ @Since("0.8.0") def train( @@ -238,11 +243,12 @@ object SVMWithSGD { * Train a SVM model given an RDD of (label, features) pairs. We run a fixed number * of iterations of gradient descent using a step size of 1.0. We use the entire data set to * update the gradient in each iteration. - * NOTE: Labels used in SVM should be {0, 1} * * @param input RDD of (label, array of features) pairs. * @param numIterations Number of iterations of gradient descent to run. * @return a SVMModel which has the weights and offset from training. + * + * @note Labels used in SVM should be {0, 1} */ @Since("0.8.0") def train(input: RDD[LabeledPoint], numIterations: Int): SVMModel = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index 43193adf3e184..56cdeea5f7a3f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -41,14 +41,14 @@ import org.apache.spark.util.Utils * While this process is generally guaranteed to converge, it is not guaranteed * to find a global optimum. * - * Note: For high-dimensional data (with many features), this algorithm may perform poorly. - * This is due to high-dimensional data (a) making it difficult to cluster at all (based - * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. - * * @param k Number of independent Gaussians in the mixture model. * @param convergenceTol Maximum change in log-likelihood at which convergence * is considered to have occurred. * @param maxIterations Maximum number of iterations allowed. + * + * @note For high-dimensional data (with many features), this algorithm may perform poorly. + * This is due to high-dimensional data (a) making it difficult to cluster at all (based + * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. 
*/ @Since("1.3.0") class GaussianMixture private ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index ed9c064879d01..fa72b72e2d921 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -56,14 +56,18 @@ class KMeans private ( def this() = this(2, 20, KMeans.K_MEANS_PARALLEL, 2, 1e-4, Utils.random.nextLong()) /** - * Number of clusters to create (k). Note that it is possible for fewer than k clusters to + * Number of clusters to create (k). + * + * @note It is possible for fewer than k clusters to * be returned, for example, if there are fewer than k distinct points to cluster. */ @Since("1.4.0") def getK: Int = k /** - * Set the number of clusters to create (k). Note that it is possible for fewer than k clusters to + * Set the number of clusters to create (k). + * + * @note It is possible for fewer than k clusters to * be returned, for example, if there are fewer than k distinct points to cluster. Default: 2. */ @Since("0.8.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index d999b9be8e8ac..7c52abdeaac22 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -175,7 +175,7 @@ class LDA private ( * * This is the parameter to a symmetric Dirichlet distribution. * - * Note: The topics' distributions over terms are called "beta" in the original LDA paper + * @note The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. */ @Since("1.3.0") @@ -187,7 +187,7 @@ class LDA private ( * * This is the parameter to a symmetric Dirichlet distribution. * - * Note: The topics' distributions over terms are called "beta" in the original LDA paper + * @note The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. * * If set to -1, then topicConcentration is set automatically. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 90d8a558f10d4..b5b0e64a2a6c6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -66,7 +66,7 @@ abstract class LDAModel private[clustering] extends Saveable { * * This is the parameter to a symmetric Dirichlet distribution. * - * Note: The topics' distributions over terms are called "beta" in the original LDA paper + * @note The topics' distributions over terms are called "beta" in the original LDA paper * by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009. 
*/ @Since("1.5.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index ae324f86fe6d1..7365ea1f200da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -93,9 +93,11 @@ final class EMLDAOptimizer extends LDAOptimizer { /** * If using checkpointing, this indicates whether to keep the last checkpoint (vs clean up). * Deleting the checkpoint can cause failures if a data partition is lost, so set this bit with - * care. Note that checkpoints will be cleaned up via reference counting, regardless. + * care. * * Default: true + * + * @note Checkpoints will be cleaned up via reference counting, regardless. */ @Since("2.0.0") def setKeepLastCheckpoint(keepLastCheckpoint: Boolean): this.type = { @@ -348,7 +350,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { * Mini-batch fraction in (0, 1], which sets the fraction of document sampled and used in * each iteration. * - * Note that this should be adjusted in synch with [[LDA.setMaxIterations()]] + * @note This should be adjusted in synch with [[LDA.setMaxIterations()]] * so the entire corpus is used. Specifically, set both so that * maxIterations * miniBatchFraction >= 1. * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala index f0779491e6374..003d1411a9cf7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala @@ -39,7 +39,7 @@ private[evaluation] object AreaUnderCurve { /** * Returns the area under the given curve. * - * @param curve a RDD of ordered 2D points stored in pairs representing a curve + * @param curve an RDD of ordered 2D points stored in pairs representing a curve */ def of(curve: RDD[(Double, Double)]): Double = { curve.sliding(2).aggregate(0.0)( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index fbd217af74ecb..c94d7890cf557 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -40,7 +40,7 @@ import org.apache.spark.sql.types._ /** * Represents a numeric vector, whose index type is Int and value type is Double. * - * Note: Users should not implement this interface. + * @note Users should not implement this interface. */ @SQLUserDefinedType(udt = classOf[VectorUDT]) @Since("1.0.0") @@ -132,7 +132,9 @@ sealed trait Vector extends Serializable { /** * Number of active entries. An "active entry" is an element which is explicitly stored, - * regardless of its value. Note that inactive entries have value 0. + * regardless of its value. + * + * @note Inactive entries have value 0. 
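A small sketch of the active-entry semantics noted above; the indices and values are arbitrary:

{{{
import org.apache.spark.mllib.linalg.Vectors

// Three entries are explicitly stored, so numActives is 3 even though one value is 0.0;
// only two entries are nonzero.
val sv = Vectors.sparse(5, Seq((0, 1.0), (2, 0.0), (4, 3.0)))
println(sv.numActives)   // 3
println(sv.numNonzeros)  // 2
}}}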
*/ @Since("1.4.0") def numActives: Int diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 377be6bfb9886..03866753b50ee 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -451,7 +451,7 @@ class BlockMatrix @Since("1.3.0") ( * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause * some performance issues until support for multiplying two sparse matrices is added. * - * Note: The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when + * @note The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when * there were blocks with duplicate indices. Now, the blocks with duplicate indices will be added * with each other. */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index b03b3ecde94f4..809906a158337 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -188,8 +188,9 @@ class IndexedRowMatrix @Since("1.0.0") ( } /** - * Computes the Gramian matrix `A^T A`. Note that this cannot be - * computed on matrices with more than 65535 columns. + * Computes the Gramian matrix `A^T A`. + * + * @note This cannot be computed on matrices with more than 65535 columns. */ @Since("1.0.0") def computeGramianMatrix(): Matrix = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index ec32e37afb792..4b120332ab8d8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -106,8 +106,9 @@ class RowMatrix @Since("1.0.0") ( } /** - * Computes the Gramian matrix `A^T A`. Note that this cannot be computed on matrices with - * more than 65535 columns. + * Computes the Gramian matrix `A^T A`. + * + * @note This cannot be computed on matrices with more than 65535 columns. */ @Since("1.0.0") def computeGramianMatrix(): Matrix = { @@ -168,9 +169,6 @@ class RowMatrix @Since("1.0.0") ( * ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK's * eigen-decomposition is set to 1e-10. * - * @note The conditions that decide which method to use internally and the default parameters are - * subject to change. - * * @param k number of leading singular values to keep (0 < k <= n). * It might return less than k if * there are numerically zero singular values or there are not enough Ritz values @@ -180,6 +178,9 @@ class RowMatrix @Since("1.0.0") ( * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0) * are treated as zero, where sigma(0) is the largest singular value. * @return SingularValueDecomposition(U, s, V). U = null if computeU = false. + * + * @note The conditions that decide which method to use internally and the default parameters are + * subject to change. 
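For illustration, a minimal `computeSVD` sketch on a small matrix, assuming an existing `SparkContext` named `sc`:

{{{
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val rows = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 5.0, 6.0),
  Vectors.dense(7.0, 8.0, 10.0)))

val mat = new RowMatrix(rows)
// Keep the top 2 singular values; U is only materialized when computeU = true.
val svd = mat.computeSVD(2, computeU = true)
println(svd.s)   // singular values
println(svd.V)   // right singular vectors as a local matrix
}}}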
*/ @Since("1.0.0") def computeSVD( @@ -319,9 +320,11 @@ class RowMatrix @Since("1.0.0") ( } /** - * Computes the covariance matrix, treating each row as an observation. Note that this cannot - * be computed on matrices with more than 65535 columns. + * Computes the covariance matrix, treating each row as an observation. + * * @return a local dense matrix of size n x n + * + * @note This cannot be computed on matrices with more than 65535 columns. */ @Since("1.0.0") def computeCovariance(): Matrix = { @@ -369,12 +372,12 @@ class RowMatrix @Since("1.0.0") ( * The row data do not need to be "centered" first; it is not necessary for * the mean of each column to be 0. * - * Note that this cannot be computed on matrices with more than 65535 columns. - * * @param k number of top principal components. * @return a matrix of size n-by-k, whose columns are principal components, and * a vector of values which indicate how much variance each principal component * explains + * + * @note This cannot be computed on matrices with more than 65535 columns. */ @Since("1.6.0") def computePrincipalComponentsAndExplainedVariance(k: Int): (Matrix, Vector) = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala index 81e64de4e5b5d..c49e72646bf13 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala @@ -305,7 +305,8 @@ class LeastSquaresGradient extends Gradient { * :: DeveloperApi :: * Compute gradient and loss for a Hinge loss function, as used in SVM binary classification. * See also the documentation for the precise formulation. - * NOTE: This assumes that the labels are {0,1} + * + * @note This assumes that the labels are {0,1} */ @DeveloperApi class HingeGradient extends Gradient { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala index 0f7857b8d8627..005119616f063 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala @@ -31,7 +31,7 @@ import org.apache.spark.rdd.RDD class RDDFunctions[T: ClassTag](self: RDD[T]) extends Serializable { /** - * Returns a RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding + * Returns an RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding * window over them. The ordering is first based on the partition index and then the ordering of * items within each partition. This is similar to sliding in Scala collections, except that it * becomes an empty RDD if the window size is greater than the total number of items. It needs to diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala index c642573ccba6d..24e4dcccc843f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala @@ -43,14 +43,14 @@ import org.apache.spark.storage.StorageLevel /** * Model representing the result of matrix factorization. 
* - * Note: If you create the model directly using constructor, please be aware that fast prediction - * requires cached user/product features and their associated partitioners. - * * @param rank Rank for the features in this model. * @param userFeatures RDD of tuples where each tuple represents the userId and * the features computed for this user. * @param productFeatures RDD of tuples where each tuple represents the productId * and the features computed for this product. + * + * @note If you create the model directly using constructor, please be aware that fast prediction + * requires cached user/product features and their associated partitioners. */ @Since("0.8.0") class MatrixFactorizationModel @Since("0.8.0") ( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index f3159f7e724cc..925fdf4d7e7bc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -60,15 +60,15 @@ object Statistics { * Compute the correlation matrix for the input RDD of Vectors using the specified method. * Methods currently supported: `pearson` (default), `spearman`. * - * Note that for Spearman, a rank correlation, we need to create an RDD[Double] for each column - * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector], - * which is fairly costly. Cache the input RDD before calling corr with `method = "spearman"` to - * avoid recomputing the common lineage. - * * @param X an RDD[Vector] for which the correlation matrix is to be computed. * @param method String specifying the method to use for computing correlation. * Supported: `pearson` (default), `spearman` * @return Correlation matrix comparing columns in X. + * + * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column + * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector], + * which is fairly costly. Cache the input RDD before calling corr with `method = "spearman"` to + * avoid recomputing the common lineage. */ @Since("1.1.0") def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method) @@ -77,12 +77,12 @@ object Statistics { * Compute the Pearson correlation for the input RDDs. * Returns NaN if either vector has 0 variance. * - * Note: the two input RDDs need to have the same number of partitions and the same number of - * elements in each partition. - * * @param x RDD[Double] of the same cardinality as y. * @param y RDD[Double] of the same cardinality as x. * @return A Double containing the Pearson correlation between the two input RDD[Double]s + * + * @note The two input RDDs need to have the same number of partitions and the same number of + * elements in each partition. */ @Since("1.1.0") def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y) @@ -98,15 +98,15 @@ object Statistics { * Compute the correlation for the input RDDs using the specified method. * Methods currently supported: `pearson` (default), `spearman`. * - * Note: the two input RDDs need to have the same number of partitions and the same number of - * elements in each partition. - * * @param x RDD[Double] of the same cardinality as y. * @param y RDD[Double] of the same cardinality as x. * @param method String specifying the method to use for computing correlation. 
* Supported: `pearson` (default), `spearman` * @return A Double containing the correlation between the two input RDD[Double]s using the * specified method. + * + * @note The two input RDDs need to have the same number of partitions and the same number of + * elements in each partition. */ @Since("1.1.0") def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method) @@ -122,15 +122,15 @@ object Statistics { * Conduct Pearson's chi-squared goodness of fit test of the observed data against the * expected distribution. * - * Note: the two input Vectors need to have the same size. - * `observed` cannot contain negative values. - * `expected` cannot contain nonpositive values. - * * @param observed Vector containing the observed categorical counts/relative frequencies. * @param expected Vector containing the expected categorical counts/relative frequencies. * `expected` is rescaled if the `expected` sum differs from the `observed` sum. * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. + * + * @note The two input Vectors need to have the same size. + * `observed` cannot contain negative values. + * `expected` cannot contain nonpositive values. */ @Since("1.1.0") def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = { @@ -141,11 +141,11 @@ object Statistics { * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform * distribution, with each category having an expected frequency of `1 / observed.size`. * - * Note: `observed` cannot contain negative values. - * * @param observed Vector containing the observed categorical counts/relative frequencies. * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, * the method used, and the null hypothesis. + * + * @note `observed` cannot contain negative values. */ @Since("1.1.0") def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 36feab7859b43..d846c43cf2913 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -75,10 +75,6 @@ object DecisionTree extends Serializable with Logging { * Method to train a decision tree model. * The method supports binary and multiclass classification and regression. * - * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] - * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] - * is recommended to clearly separate classification and regression. - * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. @@ -86,6 +82,10 @@ object DecisionTree extends Serializable with Logging { * of decision tree (classification or regression), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. * @return DecisionTreeModel that can be used for prediction. + * + * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] + * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] + * is recommended to clearly separate classification and regression. 
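Along the lines of the recommendation above, a sketch that calls `trainClassifier` directly instead of the generic `train`; the input path is hypothetical and a `SparkContext` named `sc` is assumed:

{{{
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.util.MLUtils

// Labels in {0, 1} for binary classification.
val data = MLUtils.loadLibSVMFile(sc, "data/sample_binary_classification.txt")

val model = DecisionTree.trainClassifier(
  data, 2 /* numClasses */, Map[Int, Int]() /* categoricalFeaturesInfo */,
  "gini", 5 /* maxDepth */, 32 /* maxBins */)
}}}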
*/ @Since("1.0.0") def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = { @@ -96,10 +96,6 @@ object DecisionTree extends Serializable with Logging { * Method to train a decision tree model. * The method supports binary and multiclass classification and regression. * - * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] - * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] - * is recommended to clearly separate classification and regression. - * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. @@ -108,6 +104,10 @@ object DecisionTree extends Serializable with Logging { * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means * 1 internal node + 2 leaf nodes). * @return DecisionTreeModel that can be used for prediction. + * + * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] + * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] + * is recommended to clearly separate classification and regression. */ @Since("1.0.0") def train( @@ -123,10 +123,6 @@ object DecisionTree extends Serializable with Logging { * Method to train a decision tree model. * The method supports binary and multiclass classification and regression. * - * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] - * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] - * is recommended to clearly separate classification and regression. - * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. @@ -136,6 +132,10 @@ object DecisionTree extends Serializable with Logging { * 1 internal node + 2 leaf nodes). * @param numClasses Number of classes for classification. Default value of 2. * @return DecisionTreeModel that can be used for prediction. + * + * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] + * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] + * is recommended to clearly separate classification and regression. */ @Since("1.2.0") def train( @@ -152,10 +152,6 @@ object DecisionTree extends Serializable with Logging { * Method to train a decision tree model. * The method supports binary and multiclass classification and regression. * - * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] - * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] - * is recommended to clearly separate classification and regression. - * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. @@ -170,6 +166,10 @@ object DecisionTree extends Serializable with Logging { * indicates that feature n is categorical with k categories * indexed from 0: {0, 1, ..., k-1}. * @return DecisionTreeModel that can be used for prediction. + * + * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] + * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] + * is recommended to clearly separate classification and regression. 
*/ @Since("1.0.0") def train( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala index de14ddf024d75..09274a2e1b2ac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala @@ -42,11 +42,13 @@ trait Loss extends Serializable { /** * Method to calculate error of the base learner for the gradient boosting calculation. - * Note: This method is not used by the gradient boosting algorithm but is useful for debugging - * purposes. + * * @param model Model of the weak learner. * @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * @return Measure of model error on data + * + * @note This method is not used by the gradient boosting algorithm but is useful for debugging + * purposes. */ @Since("1.2.0") def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = { @@ -55,11 +57,13 @@ trait Loss extends Serializable { /** * Method to calculate loss when the predictions are already known. - * Note: This method is used in the method evaluateEachIteration to avoid recomputing the - * predicted values from previously fit trees. + * * @param prediction Predicted label. * @param label True label. * @return Measure of model error on datapoint. + * + * @note This method is used in the method evaluateEachIteration to avoid recomputing the + * predicted values from previously fit trees. */ private[spark] def computeError(prediction: Double, label: Double): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 657ed0a8ecda8..299950785e420 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -187,7 +187,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param initTreeWeight: learning rate assigned to the first tree. * @param initTree: first DecisionTreeModel. * @param loss: evaluation metric. - * @return a RDD with each element being a zip of the prediction and error + * @return an RDD with each element being a zip of the prediction and error * corresponding to every sample. */ @Since("1.4.0") @@ -213,7 +213,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param treeWeight: Learning rate. * @param tree: Tree using which the prediction and error should be updated. * @param loss: evaluation metric. - * @return a RDD with each element being a zip of the prediction and error + * @return an RDD with each element being a zip of the prediction and error * corresponding to each sample. 
*/ @Since("1.4.0") diff --git a/pom.xml b/pom.xml index 650b4cd965b66..024b2850d0a3d 100644 --- a/pom.xml +++ b/pom.xml @@ -2476,6 +2476,13 @@ maven-javadoc-plugin -Xdoclint:all -Xdoclint:-missing + + + note + a + Note: + + diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 2d3a95b163a76..92b45657210e1 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -741,7 +741,8 @@ object Unidoc { javacOptions in (JavaUnidoc, unidoc) := Seq( "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", - "-noqualifier", "java.lang" + "-noqualifier", "java.lang", + "-tag", """note:a:Note\:""" ), // Use GitHub repository for Scaladoc source links diff --git a/python/pyspark/mllib/stat/KernelDensity.py b/python/pyspark/mllib/stat/KernelDensity.py index 3b1c5519bd87e..7250eab6705a7 100644 --- a/python/pyspark/mllib/stat/KernelDensity.py +++ b/python/pyspark/mllib/stat/KernelDensity.py @@ -28,7 +28,7 @@ class KernelDensity(object): """ - Estimate probability density at required points given a RDD of samples + Estimate probability density at required points given an RDD of samples from the population. >>> kd = KernelDensity() diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index ed6fd4bca4c54..97755807ef262 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -499,7 +499,7 @@ def generateLinearInput(intercept, weights, xMean, xVariance, def generateLinearRDD(sc, nexamples, nfeatures, eps, nParts=2, intercept=0.0): """ - Generate a RDD of LabeledPoints. + Generate an RDD of LabeledPoints. """ return callMLlibFunc( "generateLinearRDDWrapper", sc, int(nexamples), int(nfeatures), diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index a163ceafe9d3b..641787ee20e0c 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1218,7 +1218,7 @@ def mergeMaps(m1, m2): def top(self, num, key=None): """ - Get the top N elements from a RDD. + Get the top N elements from an RDD. Note that this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver's memory. @@ -1242,7 +1242,7 @@ def merge(a, b): def takeOrdered(self, num, key=None): """ - Get the N elements from a RDD ordered in ascending order or as + Get the N elements from an RDD ordered in ascending order or as specified by the optional key function. Note that this method should only be used if the resulting array is expected diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index bf27d8047a753..134424add3b62 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -144,7 +144,7 @@ def createRDD(sc, kafkaParams, offsetRanges, leaders=None, """ .. note:: Experimental - Create a RDD from Kafka using offset ranges for each topic and partition. + Create an RDD from Kafka using offset ranges for each topic and partition. :param sc: SparkContext object :param kafkaParams: Additional params for Kafka @@ -155,7 +155,7 @@ def createRDD(sc, kafkaParams, offsetRanges, leaders=None, :param valueDecoder: A function used to decode value (default is utf8_decoder) :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can assess meta using messageHandler (default is None). 
- :return: A RDD object + :return: An RDD object """ if leaders is None: leaders = dict() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala index dc90659a676e0..0b95a8821b05a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Encoders.scala @@ -165,10 +165,10 @@ object Encoders { * (Scala-specific) Creates an encoder that serializes objects of type T using generic Java * serialization. This encoder maps T into a single byte array (binary) field. * - * Note that this is extremely inefficient and should only be used as the last resort. - * * T must be publicly accessible. * + * @note This is extremely inefficient and should only be used as the last resort. + * * @since 1.6.0 */ def javaSerialization[T: ClassTag]: Encoder[T] = genericSerializer(useKryo = false) @@ -177,10 +177,10 @@ object Encoders { * Creates an encoder that serializes objects of type T using generic Java serialization. * This encoder maps T into a single byte array (binary) field. * - * Note that this is extremely inefficient and should only be used as the last resort. - * * T must be publicly accessible. * + * @note This is extremely inefficient and should only be used as the last resort. + * * @since 1.6.0 */ def javaSerialization[T](clazz: Class[T]): Encoder[T] = javaSerialization(ClassTag[T](clazz)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala index e121044288e5a..21f3497ba06fb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/CalendarIntervalType.scala @@ -23,10 +23,10 @@ import org.apache.spark.annotation.InterfaceStability * The data type representing calendar time intervals. The calendar time interval is stored * internally in two components: number of months the number of microseconds. * - * Note that calendar intervals are not comparable. - * * Please use the singleton [[DataTypes.CalendarIntervalType]]. * + * @note Calendar intervals are not comparable. + * * @since 1.5.0 */ @InterfaceStability.Stable diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 7a131b30eafd7..fa3b2b9de5d5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -118,7 +118,7 @@ class TypedColumn[-T, U]( * $"a" === $"b" * }}} * - * Note that the internal Catalyst expression can be accessed via "expr", but this method is for + * @note The internal Catalyst expression can be accessed via "expr", but this method is for * debugging purposes only and can change in any future Spark releases. 
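A brief sketch of the distinction drawn above between a `Column` and its internal expression; the column names are made up:

{{{
import org.apache.spark.sql.functions.col

val cond = col("a") === col("b")
println(cond)        // user-facing column representation
println(cond.expr)   // underlying Catalyst expression: debugging only, may change
}}}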
* * @groupname java_expr_ops Java-specific expression operators diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index b5bbcee37150f..6335fc4579a28 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -51,7 +51,6 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 Space-efficient * Online Computation of Quantile Summaries]] by Greenwald and Khanna. * - * Note that NaN values will be removed from the numerical column before calculation * @param col the name of the numerical column * @param probabilities a list of quantile probabilities * Each number must belong to [0, 1]. @@ -61,6 +60,8 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * Note that values greater than 1 are accepted but give the same result as 1. * @return the approximate quantiles at the given probabilities * + * @note NaN values will be removed from the numerical column before calculation + * * @since 2.0.0 */ def approxQuantile( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index e0c89811ddbfa..15281f24fa628 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -218,7 +218,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * Inserts the content of the [[DataFrame]] to the specified table. It requires that * the schema of the [[DataFrame]] is the same as the schema of the table. * - * Note: Unlike `saveAsTable`, `insertInto` ignores the column names and just uses position-based + * @note Unlike `saveAsTable`, `insertInto` ignores the column names and just uses position-based * resolution. For example: * * {{{ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 3761773698df3..3c75a6a45ec86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -377,7 +377,7 @@ class Dataset[T] private[sql]( /** * Converts this strongly typed collection of data to generic `DataFrame` with columns renamed. - * This can be quite convenient in conversion from a RDD of tuples into a [[DataFrame]] with + * This can be quite convenient in conversion from an RDD of tuples into a [[DataFrame]] with * meaningful names. For example: * {{{ * val rdd: RDD[(Int, String)] = ... @@ -703,13 +703,13 @@ class Dataset[T] private[sql]( * df1.join(df2, "user_id") * }}} * - * Note that if you perform a self-join using this function without aliasing the input - * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since - * there is no way to disambiguate which side of the join you would like to reference. - * * @param right Right side of the join operation. * @param usingColumn Name of the column to join on. This column must exist on both sides. * + * @note If you perform a self-join using this function without aliasing the input + * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * there is no way to disambiguate which side of the join you would like to reference. 
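One common way to keep a self-join unambiguous, as the note above suggests, is to alias both sides before joining; `people` and its columns are hypothetical:

{{{
import org.apache.spark.sql.functions.col

val left = people.as("emp")
val right = people.as("mgr")

// Join on an explicit condition; the aliases keep both sides addressable afterwards.
val joined = left.join(right, col("emp.manager_id") === col("mgr.id"))
joined.select(col("emp.id"), col("mgr.id"))
}}}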
+ * * @group untypedrel * @since 2.0.0 */ @@ -728,13 +728,13 @@ class Dataset[T] private[sql]( * df1.join(df2, Seq("user_id", "user_name")) * }}} * - * Note that if you perform a self-join using this function without aliasing the input - * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since - * there is no way to disambiguate which side of the join you would like to reference. - * * @param right Right side of the join operation. * @param usingColumns Names of the columns to join on. This columns must exist on both sides. * + * @note If you perform a self-join using this function without aliasing the input + * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * there is no way to disambiguate which side of the join you would like to reference. + * * @group untypedrel * @since 2.0.0 */ @@ -748,14 +748,14 @@ class Dataset[T] private[sql]( * Different from other join functions, the join columns will only appear once in the output, * i.e. similar to SQL's `JOIN USING` syntax. * - * Note that if you perform a self-join using this function without aliasing the input - * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since - * there is no way to disambiguate which side of the join you would like to reference. - * * @param right Right side of the join operation. * @param usingColumns Names of the columns to join on. This columns must exist on both sides. * @param joinType One of: `inner`, `outer`, `left_outer`, `right_outer`, `leftsemi`. * + * @note If you perform a self-join using this function without aliasing the input + * [[DataFrame]]s, you will NOT be able to reference any columns after the join, since + * there is no way to disambiguate which side of the join you would like to reference. + * * @group untypedrel * @since 2.0.0 */ @@ -856,10 +856,10 @@ class Dataset[T] private[sql]( /** * Explicit cartesian join with another [[DataFrame]]. * - * Note that cartesian joins are very expensive without an extra filter that can be pushed down. - * * @param right Right side of the join operation. * + * @note Cartesian joins are very expensive without an extra filter that can be pushed down. + * * @group untypedrel * @since 2.1.0 */ @@ -1044,7 +1044,8 @@ class Dataset[T] private[sql]( /** * Selects column based on the column name and return it as a [[Column]]. - * Note that the column name can also reference to a nested column like `a.b`. + * + * @note The column name can also reference to a nested column like `a.b`. * * @group untypedrel * @since 2.0.0 @@ -1053,7 +1054,8 @@ class Dataset[T] private[sql]( /** * Selects column based on the column name and return it as a [[Column]]. - * Note that the column name can also reference to a nested column like `a.b`. + * + * @note The column name can also reference to a nested column like `a.b`. * * @group untypedrel * @since 2.0.0 @@ -1621,7 +1623,7 @@ class Dataset[T] private[sql]( * Returns a new Dataset containing rows only in both this Dataset and another Dataset. * This is equivalent to `INTERSECT` in SQL. * - * Note that, equality checking is performed directly on the encoded representation of the data + * @note Equality checking is performed directly on the encoded representation of the data * and thus is not affected by a custom `equals` function defined on `T`. * * @group typedrel @@ -1635,7 +1637,7 @@ class Dataset[T] private[sql]( * Returns a new Dataset containing rows in this Dataset but not in another Dataset. 
* This is equivalent to `EXCEPT` in SQL. * - * Note that, equality checking is performed directly on the encoded representation of the data + * @note Equality checking is performed directly on the encoded representation of the data * and thus is not affected by a custom `equals` function defined on `T`. * * @group typedrel @@ -1648,13 +1650,13 @@ class Dataset[T] private[sql]( /** * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed. * - * Note: this is NOT guaranteed to provide exactly the fraction of the count - * of the given [[Dataset]]. - * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. * @param seed Seed for sampling. * + * @note This is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. + * * @group typedrel * @since 1.6.0 */ @@ -1670,12 +1672,12 @@ class Dataset[T] private[sql]( /** * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed. * - * Note: this is NOT guaranteed to provide exactly the fraction of the total count - * of the given [[Dataset]]. - * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. * + * @note This is NOT guaranteed to provide exactly the fraction of the total count + * of the given [[Dataset]]. + * * @group typedrel * @since 1.6.0 */ @@ -2375,7 +2377,7 @@ class Dataset[T] private[sql]( * * The iterator will consume as much memory as the largest partition in this Dataset. * - * Note: this results in multiple Spark jobs, and if the input Dataset is the result + * @note this results in multiple Spark jobs, and if the input Dataset is the result * of a wide transformation (e.g. join with different partitioners), to avoid * recomputing the input Dataset should be cached first. * @@ -2453,7 +2455,7 @@ class Dataset[T] private[sql]( * Returns a new Dataset that contains only the unique rows from this Dataset. * This is an alias for `dropDuplicates`. * - * Note that, equality checking is performed directly on the encoded representation of the data + * @note Equality checking is performed directly on the encoded representation of the data * and thus is not affected by a custom `equals` function defined on `T`. * * @group typedrel diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 3c5cf037c578d..2fae93651b344 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -181,9 +181,6 @@ class SQLContext private[sql](val sparkSession: SparkSession) /** * A collection of methods for registering user-defined functions (UDF). - * Note that the user-defined functions must be deterministic. Due to optimization, - * duplicate invocations may be eliminated or the function may even be invoked more times than - * it is present in the query. * * The following example registers a Scala closure as UDF: * {{{ @@ -208,6 +205,10 @@ class SQLContext private[sql](val sparkSession: SparkSession) * DataTypes.StringType); * }}} * + * @note The user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. 
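For illustration, a minimal registration sketch, assuming an existing `SparkSession` named `spark` (the same note applies to `SQLContext.udf`):

{{{
// The function must be deterministic: due to optimization, Spark may invoke it
// more or fewer times than it appears in the query.
spark.udf.register("strLen", (s: String) => s.length)
spark.sql("SELECT strLen('Spark')").show()   // single row containing 5
}}}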
+ * * @group basic * @since 1.3.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 58b2ab3957173..e09e3caa3c981 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -155,9 +155,6 @@ class SparkSession private( /** * A collection of methods for registering user-defined functions (UDF). - * Note that the user-defined functions must be deterministic. Due to optimization, - * duplicate invocations may be eliminated or the function may even be invoked more times than - * it is present in the query. * * The following example registers a Scala closure as UDF: * {{{ @@ -182,6 +179,10 @@ class SparkSession private( * DataTypes.StringType); * }}} * + * @note The user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. + * * @since 2.0.0 */ def udf: UDFRegistration = sessionState.udf @@ -201,7 +202,7 @@ class SparkSession private( * Start a new session with isolated SQL configurations, temporary tables, registered * functions are isolated, but sharing the underlying [[SparkContext]] and cached data. * - * Note: Other than the [[SparkContext]], all shared state is initialized lazily. + * @note Other than the [[SparkContext]], all shared state is initialized lazily. * This method will force the initialization of the shared state to ensure that parent * and child sessions are set up with the same shared state. If the underlying catalog * implementation is Hive, this will initialize the metastore, which may take some time. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 0444ad10d34fb..6043c5ee14b54 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -39,7 +39,8 @@ import org.apache.spark.util.Utils /** * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. - * Note that the user-defined functions must be deterministic. + * + * @note The user-defined functions must be deterministic. * * @since 1.3.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala index 4914a9d722a83..1b56c08f729c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/package.scala @@ -28,7 +28,7 @@ package object state { implicit class StateStoreOps[T: ClassTag](dataRDD: RDD[T]) { - /** Map each partition of a RDD along with data in a [[StateStore]]. */ + /** Map each partition of an RDD along with data in a [[StateStore]]. */ def mapPartitionsWithStateStore[U: ClassTag]( sqlContext: SQLContext, checkpointLocation: String, @@ -49,7 +49,7 @@ package object state { storeUpdateFunction) } - /** Map each partition of a RDD along with data in a [[StateStore]]. */ + /** Map each partition of an RDD along with data in a [[StateStore]]. 
*/ private[streaming] def mapPartitionsWithStateStore[U: ClassTag]( checkpointLocation: String, operatorId: Long, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 28598af781653..36dd5f78ac137 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -25,9 +25,7 @@ import org.apache.spark.sql.types.DataType /** * A user-defined function. To create one, use the `udf` functions in [[functions]]. - * Note that the user-defined functions must be deterministic. Due to optimization, - * duplicate invocations may be eliminated or the function may even be invoked more times than - * it is present in the query. + * * As an example: * {{{ * // Defined a UDF that returns true or false based on some numeric score. @@ -37,6 +35,10 @@ import org.apache.spark.sql.types.DataType * df.select( predict(df("score")) ) * }}} * + * @note The user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. + * * @since 1.3.0 */ @InterfaceStability.Stable diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index e221c032b82f6..d5940c638acdb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -476,7 +476,7 @@ object functions { * * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) * - * Note: the list of columns should match with grouping columns exactly, or empty (means all the + * @note The list of columns should match with grouping columns exactly, or empty (means all the * grouping columns). * * @group agg_funcs @@ -489,7 +489,7 @@ object functions { * * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) * - * Note: the list of columns should match with grouping columns exactly. + * @note The list of columns should match with grouping columns exactly. * * @group agg_funcs * @since 2.0.0 @@ -1120,7 +1120,7 @@ object functions { * Generate a random column with independent and identically distributed (i.i.d.) samples * from U[0.0, 1.0]. * - * Note that this is indeterministic when data partitions are not fixed. + * @note This is indeterministic when data partitions are not fixed. * * @group normal_funcs * @since 1.4.0 @@ -1140,7 +1140,7 @@ object functions { * Generate a column with independent and identically distributed (i.i.d.) samples from * the standard normal distribution. * - * Note that this is indeterministic when data partitions are not fixed. + * @note This is indeterministic when data partitions are not fixed. * * @group normal_funcs * @since 1.4.0 @@ -1159,7 +1159,7 @@ object functions { /** * Partition ID. * - * Note that this is indeterministic because it depends on data partitioning and task scheduling. + * @note This is indeterministic because it depends on data partitioning and task scheduling. * * @group normal_funcs * @since 1.6.0 @@ -2207,7 +2207,7 @@ object functions { * Locate the position of the first occurrence of substr column in the given string. * Returns null if either of the arguments are null. * - * NOTE: The position is not zero based, but 1 based index. 
Returns 0 if substr + * @note The position is not zero based, but 1 based index. Returns 0 if substr * could not be found in str. * * @group string_funcs @@ -2242,7 +2242,8 @@ object functions { /** * Locate the position of the first occurrence of substr. - * NOTE: The position is not zero based, but 1 based index. Returns 0 if substr + * + * @note The position is not zero based, but 1 based index. Returns 0 if substr * could not be found in str. * * @group string_funcs @@ -2255,7 +2256,7 @@ object functions { /** * Locate the position of the first occurrence of substr in a string column, after position pos. * - * NOTE: The position is not zero based, but 1 based index. returns 0 if substr + * @note The position is not zero based, but 1 based index. returns 0 if substr * could not be found in str. * * @group string_funcs @@ -2369,7 +2370,8 @@ object functions { /** * Splits str around pattern (pattern is a regular expression). - * NOTE: pattern is a string representation of the regular expression. + * + * @note Pattern is a string representation of the regular expression. * * @group string_funcs * @since 1.5.0 @@ -2468,7 +2470,7 @@ object functions { * A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All * pattern letters of [[java.text.SimpleDateFormat]] can be used. * - * NOTE: Use when ever possible specialized functions like [[year]]. These benefit from a + * @note Use when ever possible specialized functions like [[year]]. These benefit from a * specialized implementation. * * @group datetime_funcs diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index dec316be7aea1..7c64e28d24724 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -140,7 +140,7 @@ abstract class JdbcDialect extends Serializable { * tried in reverse order. A user-added dialect will thus be applied first, * overwriting the defaults. * - * Note that all new dialects are applied to new jdbc DataFrames only. Make + * @note All new dialects are applied to new jdbc DataFrames only. Make * sure to register your dialects first. */ @DeveloperApi diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 15a48072525b2..ff6dd8cb0cf92 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -69,7 +69,8 @@ trait DataSourceRegister { trait RelationProvider { /** * Returns a new base relation with the given parameters. - * Note: the parameters' keywords are case insensitive and this insensitivity is enforced + * + * @note The parameters' keywords are case insensitive and this insensitivity is enforced * by the Map that is passed to the function. */ def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation @@ -99,7 +100,8 @@ trait RelationProvider { trait SchemaRelationProvider { /** * Returns a new base relation with the given parameters and user defined schema. - * Note: the parameters' keywords are case insensitive and this insensitivity is enforced + * + * @note The parameters' keywords are case insensitive and this insensitivity is enforced * by the Map that is passed to the function. 
*/ def createRelation( @@ -205,7 +207,7 @@ abstract class BaseRelation { * large to broadcast. This method will be called multiple times during query planning * and thus should not perform expensive operations for each invocation. * - * Note that it is always better to overestimate size than underestimate, because underestimation + * @note It is always better to overestimate size than underestimate, because underestimation * could lead to execution plans that are suboptimal (i.e. broadcasting a very large table). * * @since 1.3.0 @@ -219,7 +221,7 @@ abstract class BaseRelation { * * If `needConversion` is `false`, buildScan() should return an [[RDD]] of [[InternalRow]] * - * Note: The internal representation is not stable across releases and thus data sources outside + * @note The internal representation is not stable across releases and thus data sources outside * of Spark SQL should leave this as true. * * @since 1.4.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala index 5e93fc469a41f..4504582187b97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.execution.QueryExecution * :: Experimental :: * The interface of query execution listener that can be used to analyze execution metrics. * - * Note that implementations should guarantee thread-safety as they can be invoked by + * @note Implementations should guarantee thread-safety as they can be invoked by * multiple different threads. */ @Experimental @@ -39,24 +39,26 @@ trait QueryExecutionListener { /** * A callback function that will be called when a query executed successfully. - * Note that this can be invoked by multiple different threads. * * @param funcName name of the action that triggered this query. * @param qe the QueryExecution object that carries detail information like logical plan, * physical plan, etc. * @param durationNs the execution time for this query in nanoseconds. + * + * @note This can be invoked by multiple different threads. */ @DeveloperApi def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit /** * A callback function that will be called when a query execution failed. - * Note that this can be invoked by multiple different threads. * * @param funcName the name of the action that triggered this query. * @param qe the QueryExecution object that carries detail information like logical plan, * physical plan, etc. * @param exception the exception that failed this query. + * + * @note This can be invoked by multiple different threads. 
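A minimal listener sketch that keeps its shared state in atomics, since both callbacks may be invoked from multiple threads as noted above; registration is shown as a comment because the `SparkSession` named `spark` is assumed:

{{{
import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener

class MetricsListener extends QueryExecutionListener {
  val successes = new AtomicLong(0)
  val failures = new AtomicLong(0)

  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {
    successes.incrementAndGet()
  }

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {
    failures.incrementAndGet()
  }
}

// spark.listenerManager.register(new MetricsListener())
}}}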
*/ @DeveloperApi def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 0daa29b666f62..b272c8e7d79c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -157,7 +157,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { val allColumns = fields.map(_.name).mkString(",") val schema = StructType(fields) - // Create a RDD for the schema + // Create an RDD for the schema val rdd = sparkContext.parallelize((1 to 10000), 10).map { i => Row( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 4808d0fcbc6cc..444261da8de6a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -421,11 +421,11 @@ class StreamingContext private[streaming] ( * by "moving" them from another location within the same file system. File names * starting with . are ignored. * - * '''Note:''' We ensure that the byte array for each record in the - * resulting RDDs of the DStream has the provided record length. - * * @param directory HDFS directory to monitor for new file * @param recordLength length of each record in bytes + * + * @note We ensure that the byte array for each record in the + * resulting RDDs of the DStream has the provided record length. */ def binaryRecordsStream( directory: String, @@ -447,12 +447,12 @@ class StreamingContext private[streaming] ( * Create an input stream from a queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * - * NOTE: Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of - * those RDDs, so `queueStream` doesn't support checkpointing. - * * @param queue Queue of RDDs. Modifications to this data structure must be synchronized. * @param oneAtATime Whether only one RDD should be consumed from the queue in every interval * @tparam T Type of objects in the RDD + * + * @note Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of + * those RDDs, so `queueStream` doesn't support checkpointing. */ def queueStream[T: ClassTag]( queue: Queue[RDD[T]], @@ -465,14 +465,14 @@ class StreamingContext private[streaming] ( * Create an input stream from a queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * - * NOTE: Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of - * those RDDs, so `queueStream` doesn't support checkpointing. - * * @param queue Queue of RDDs. Modifications to this data structure must be synchronized. * @param oneAtATime Whether only one RDD should be consumed from the queue in every interval * @param defaultRDD Default RDD is returned by the DStream when the queue is empty. * Set as null if no RDD should be returned when empty * @tparam T Type of objects in the RDD + * + * @note Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of + * those RDDs, so `queueStream` doesn't support checkpointing. 
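For illustration, a sketch of a queue-based stream, assuming an existing `StreamingContext` named `ssc`; because queued RDDs cannot be recovered, this pattern is typically used for tests rather than checkpointed production jobs:

{{{
import scala.collection.mutable.Queue

import org.apache.spark.rdd.RDD

val rddQueue = Queue[RDD[Int]]()
val stream = ssc.queueStream(rddQueue)   // oneAtATime defaults to true
stream.map(_ * 2).print()

rddQueue += ssc.sparkContext.makeRDD(1 to 10, 2)
// ssc.start(); ssc.awaitTermination()
}}}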
*/ def queueStream[T: ClassTag]( queue: Queue[RDD[T]], diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala index da9ff858853cf..aa4003c62e1e7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala @@ -74,7 +74,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( */ def repartition(numPartitions: Int): JavaPairDStream[K, V] = dstream.repartition(numPartitions) - /** Method that generates a RDD for the given Duration */ + /** Method that generates an RDD for the given Duration */ def compute(validTime: Time): JavaPairRDD[K, V] = { dstream.compute(validTime) match { case Some(rdd) => new JavaPairRDD(rdd) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index 4c4376a089f59..b43b9405def97 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -218,11 +218,11 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * for new files and reads them as flat binary files with fixed record lengths, * yielding byte arrays * - * '''Note:''' We ensure that the byte array for each record in the - * resulting RDDs of the DStream has the provided record length. - * * @param directory HDFS directory to monitor for new files * @param recordLength The length at which to split the records + * + * @note We ensure that the byte array for each record in the + * resulting RDDs of the DStream has the provided record length. */ def binaryRecordsStream(directory: String, recordLength: Int): JavaDStream[Array[Byte]] = { ssc.binaryRecordsStream(directory, recordLength) @@ -352,13 +352,13 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * Create an input stream from a queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * - * NOTE: + * @param queue Queue of RDDs + * @tparam T Type of objects in the RDD + * + * @note * 1. Changes to the queue after the stream is created will not be recognized. * 2. Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of * those RDDs, so `queueStream` doesn't support checkpointing. - * - * @param queue Queue of RDDs - * @tparam T Type of objects in the RDD */ def queueStream[T](queue: java.util.Queue[JavaRDD[T]]): JavaDStream[T] = { implicit val cm: ClassTag[T] = @@ -372,14 +372,14 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * Create an input stream from a queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * - * NOTE: - * 1. Changes to the queue after the stream is created will not be recognized. - * 2. Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of - * those RDDs, so `queueStream` doesn't support checkpointing. - * * @param queue Queue of RDDs * @param oneAtATime Whether only one RDD should be consumed from the queue in every interval * @tparam T Type of objects in the RDD + * + * @note + * 1. Changes to the queue after the stream is created will not be recognized. + * 2. 
Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of + * those RDDs, so `queueStream` doesn't support checkpointing. */ def queueStream[T]( queue: java.util.Queue[JavaRDD[T]], @@ -396,7 +396,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * Create an input stream from a queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * - * NOTE: + * @note * 1. Changes to the queue after the stream is created will not be recognized. * 2. Arbitrary RDDs can be added to `queueStream`, there is no way to recover data of * those RDDs, so `queueStream` doesn't support checkpointing. @@ -454,9 +454,10 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { /** * Create a new DStream in which each RDD is generated by applying a function on RDDs of * the DStreams. The order of the JavaRDDs in the transform function parameter will be the - * same as the order of corresponding DStreams in the list. Note that for adding a - * JavaPairDStream in the list of JavaDStreams, convert it to a JavaDStream using - * [[org.apache.spark.streaming.api.java.JavaPairDStream]].toJavaDStream(). + * same as the order of corresponding DStreams in the list. + * + * @note For adding a JavaPairDStream in the list of JavaDStreams, convert it to a + * JavaDStream using [[org.apache.spark.streaming.api.java.JavaPairDStream]].toJavaDStream(). * In the transform function, convert the JavaRDD corresponding to that JavaDStream to * a JavaPairRDD using org.apache.spark.api.java.JavaPairRDD.fromJavaRDD(). */ @@ -476,9 +477,10 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { /** * Create a new DStream in which each RDD is generated by applying a function on RDDs of * the DStreams. The order of the JavaRDDs in the transform function parameter will be the - * same as the order of corresponding DStreams in the list. Note that for adding a - * JavaPairDStream in the list of JavaDStreams, convert it to a JavaDStream using - * [[org.apache.spark.streaming.api.java.JavaPairDStream]].toJavaDStream(). + * same as the order of corresponding DStreams in the list. + * + * @note For adding a JavaPairDStream in the list of JavaDStreams, convert it to + * a JavaDStream using [[org.apache.spark.streaming.api.java.JavaPairDStream]].toJavaDStream(). * In the transform function, convert the JavaRDD corresponding to that JavaDStream to * a JavaPairRDD using org.apache.spark.api.java.JavaPairRDD.fromJavaRDD(). 
*/ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index fa15a0bf65ab9..938a7fac1af41 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -68,13 +68,13 @@ abstract class DStream[T: ClassTag] ( // Methods that should be implemented by subclasses of DStream // ======================================================================= - /** Time interval after which the DStream generates a RDD */ + /** Time interval after which the DStream generates an RDD */ def slideDuration: Duration /** List of parent DStreams on which this DStream depends on */ def dependencies: List[DStream[_]] - /** Method that generates a RDD for the given time */ + /** Method that generates an RDD for the given time */ def compute(validTime: Time): Option[RDD[T]] // ======================================================================= diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala index ed08191f41cc8..9512db7d7d757 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/MapWithStateDStream.scala @@ -128,7 +128,7 @@ class InternalMapWithStateDStream[K: ClassTag, V: ClassTag, S: ClassTag, E: Clas super.initialize(time) } - /** Method that generates a RDD for the given time */ + /** Method that generates an RDD for the given time */ override def compute(validTime: Time): Option[RDD[MapWithStateRDDRecord[K, S, E]]] = { // Get the previous state or create a new empty state RDD val prevStateRDD = getOrCompute(validTime - slideDuration) match { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala index ce5a6e00fb2fe..a37fac87300b7 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala @@ -186,7 +186,7 @@ class WriteAheadLogBackedBlockRDDSuite assert(rdd.collect() === data.flatten) // Verify that the block fetching is skipped when isBlockValid is set to false. - // This is done by using a RDD whose data is only in memory but is set to skip block fetching + // This is done by using an RDD whose data is only in memory but is set to skip block fetching // Using that RDD will throw exception, as it skips block fetching even if the blocks are in // in BlockManager. if (testIsBlockValid) { From 30a6fbbb0fb47f5b74ceba3384f28a61bf4e4740 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 19 Nov 2016 11:28:25 +0000 Subject: [PATCH 149/534] [SPARK-18353][CORE] spark.rpc.askTimeout default value is not 120s ## What changes were proposed in this pull request? Avoid hard-coding spark.rpc.askTimeout to non-default in Client; fix doc about spark.rpc.askTimeout default ## How was this patch tested? Existing tests Author: Sean Owen Closes #15833 from srowen/SPARK-18353.
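For readers skimming the patch, a minimal sketch of the intended behavior (an illustration only, using a plain `SparkConf`; the real change lives in `org.apache.spark.deploy.Client`, shown in the diff below): the short 10s ask timeout is applied only when the user has not set `spark.rpc.askTimeout` explicitly, and the documented default becomes the value of `spark.network.timeout` rather than a hard-coded 120s.

```scala
import org.apache.spark.SparkConf

// Sketch of the guarded default: respect a user-supplied spark.rpc.askTimeout,
// and only fall back to the short client-side value when none was configured.
val conf = new SparkConf()
if (!conf.contains("spark.rpc.askTimeout")) {
  conf.set("spark.rpc.askTimeout", "10s")
}
```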
(cherry picked from commit 8b1e1088eb274fb15260cd5d6d9508d42837a4d6) Signed-off-by: Sean Owen --- core/src/main/scala/org/apache/spark/deploy/Client.scala | 4 +++- docs/configuration.md | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index ee276e1b71138..a4de3d7eaf458 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -221,7 +221,9 @@ object Client { val conf = new SparkConf() val driverArgs = new ClientArguments(args) - conf.set("spark.rpc.askTimeout", "10") + if (!conf.contains("spark.rpc.askTimeout")) { + conf.set("spark.rpc.askTimeout", "10s") + } Logger.getRootLogger.setLevel(driverArgs.logLevel) val rpcEnv = diff --git a/docs/configuration.md b/docs/configuration.md index e0c661349caab..c2329b411fc69 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1175,7 +1175,7 @@ Apart from these, the following properties are also available, and may be useful spark.rpc.askTimeout - 120s + spark.network.timeout Duration for an RPC ask operation to wait before timing out. @@ -1531,7 +1531,7 @@ Apart from these, the following properties are also available, and may be useful spark.core.connection.ack.wait.timeout - 60s + spark.network.timeout How long for the connection to wait for ack to occur before timing out and giving up. To avoid unwilling timeout caused by long pause like GC, From 15ad3a319b91a8b495da9a0e6f5386417991d30d Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 19 Nov 2016 13:48:56 +0000 Subject: [PATCH 150/534] [SPARK-18448][CORE] Fix @since 2.1.0 on new SparkSession.close() method ## What changes were proposed in this pull request? Fix since 2.1.0 on new SparkSession.close() method. I goofed in https://github.com/apache/spark/pull/15932 because it was back-ported to 2.1 instead of just master as originally planned. Author: Sean Owen Closes #15938 from srowen/SPARK-18448.2. (cherry picked from commit ded5fefb6f5c0a97bf3d7fa1c0494dc434b6ee40) Signed-off-by: Sean Owen --- sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index e09e3caa3c981..71b1880dc0715 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -652,7 +652,7 @@ class SparkSession private( /** * Synonym for `stop()`. * - * @since 2.2.0 + * @since 2.1.0 */ override def close(): Unit = stop() From 15eb86c29c02178f4413df63c39b8df3cda30ca8 Mon Sep 17 00:00:00 2001 From: sethah Date: Sun, 20 Nov 2016 01:42:37 +0000 Subject: [PATCH 151/534] [SPARK-18456][ML][FOLLOWUP] Use matrix abstraction for coefficients in LogisticRegression training ## What changes were proposed in this pull request? This is a follow up to some of the discussion [here](https://github.com/apache/spark/pull/15593). During LogisticRegression training, we store the coefficients combined with intercepts as a flat vector, but a more natural abstraction is a matrix. Here, we refactor the code to use matrix where possible, which makes the code more readable and greatly simplifies the indexing. Note: We do not use a Breeze matrix for the cost function as was mentioned in the linked PR. 
This is because LBFGS/OWLQN require an implicit `MutableInnerProductModule[DenseMatrix[Double], Double]` which is not natively defined in Breeze. We would need to extend Breeze in Spark to define it ourselves. Also, we do not modify the `regParamL1Fun` because OWLQN in Breeze requires a `MutableEnumeratedCoordinateField[(Int, Int), DenseVector[Double]]` (since we still use a dense vector for coefficients). Here again we would have to extend Breeze inside Spark. ## How was this patch tested? This is internal code refactoring - the current unit tests passing show us that the change did not break anything. No added functionality in this patch. Author: sethah Closes #15893 from sethah/logreg_refactor. (cherry picked from commit 856e0042007c789dda4539fb19a5d4580999fbf4) Signed-off-by: DB Tsai --- .../classification/LogisticRegression.scala | 115 ++++++++---------- 1 file changed, 53 insertions(+), 62 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 71a7fe53c15f8..f58efd36a1c66 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -463,16 +463,11 @@ class LogisticRegression @Since("1.2.0") ( } /* - The coefficients are laid out in column major order during training. e.g. for - `numClasses = 3` and `numFeatures = 2` and `fitIntercept = true` the layout is: - - Array(beta_11, beta_21, beta_31, beta_12, beta_22, beta_32, intercept_1, intercept_2, - intercept_3) - - where beta_jk corresponds to the coefficient for class `j` and feature `k`. + The coefficients are laid out in column major order during training. Here we initialize + a column major matrix of initial coefficients. 
*/ - val initialCoefficientsWithIntercept = - Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept) + val initialCoefWithInterceptMatrix = + Matrices.zeros(numCoefficientSets, numFeaturesPlusIntercept) val initialModelIsValid = optInitialModel match { case Some(_initialModel) => @@ -491,18 +486,15 @@ class LogisticRegression @Since("1.2.0") ( } if (initialModelIsValid) { - val initialCoefWithInterceptArray = initialCoefficientsWithIntercept.toArray val providedCoef = optInitialModel.get.coefficientMatrix - providedCoef.foreachActive { (row, col, value) => - // convert matrix to column major for training - val flatIndex = col * numCoefficientSets + row + providedCoef.foreachActive { (classIndex, featureIndex, value) => // We need to scale the coefficients since they will be trained in the scaled space - initialCoefWithInterceptArray(flatIndex) = value * featuresStd(col) + initialCoefWithInterceptMatrix.update(classIndex, featureIndex, + value * featuresStd(featureIndex)) } if ($(fitIntercept)) { - optInitialModel.get.interceptVector.foreachActive { (index, value) => - val coefIndex = numCoefficientSets * numFeatures + index - initialCoefWithInterceptArray(coefIndex) = value + optInitialModel.get.interceptVector.foreachActive { (classIndex, value) => + initialCoefWithInterceptMatrix.update(classIndex, numFeatures, value) } } } else if ($(fitIntercept) && isMultinomial) { @@ -532,8 +524,7 @@ class LogisticRegression @Since("1.2.0") ( val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing val rawMean = rawIntercepts.sum / rawIntercepts.length rawIntercepts.indices.foreach { i => - initialCoefficientsWithIntercept.toArray(numClasses * numFeatures + i) = - rawIntercepts(i) - rawMean + initialCoefWithInterceptMatrix.update(i, numFeatures, rawIntercepts(i) - rawMean) } } else if ($(fitIntercept)) { /* @@ -549,12 +540,12 @@ class LogisticRegression @Since("1.2.0") ( b = \log{P(1) / P(0)} = \log{count_1 / count_0} }}} */ - initialCoefficientsWithIntercept.toArray(numFeatures) = math.log( - histogram(1) / histogram(0)) + initialCoefWithInterceptMatrix.update(0, numFeatures, + math.log(histogram(1) / histogram(0))) } val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialCoefficientsWithIntercept.asBreeze.toDenseVector) + new BDV[Double](initialCoefWithInterceptMatrix.toArray)) /* Note that in Logistic Regression, the objective history (loss + regularization) @@ -586,15 +577,24 @@ class LogisticRegression @Since("1.2.0") ( Note that the intercept in scaled space and original space is the same; as a result, no scaling is needed. 
*/ - val rawCoefficients = state.x.toArray.clone() - val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i => - val colMajorIndex = (i % numFeatures) * numCoefficientSets + i / numFeatures - val featureIndex = i % numFeatures - if (featuresStd(featureIndex) != 0.0) { - rawCoefficients(colMajorIndex) / featuresStd(featureIndex) - } else { - 0.0 + val allCoefficients = state.x.toArray.clone() + val allCoefMatrix = new DenseMatrix(numCoefficientSets, numFeaturesPlusIntercept, + allCoefficients) + val denseCoefficientMatrix = new DenseMatrix(numCoefficientSets, numFeatures, + new Array[Double](numCoefficientSets * numFeatures), isTransposed = true) + val interceptVec = if ($(fitIntercept) || !isMultinomial) { + Vectors.zeros(numCoefficientSets) + } else { + Vectors.sparse(numCoefficientSets, Seq()) + } + // separate intercepts and coefficients from the combined matrix + allCoefMatrix.foreachActive { (classIndex, featureIndex, value) => + val isIntercept = $(fitIntercept) && (featureIndex == numFeatures) + if (!isIntercept && featuresStd(featureIndex) != 0.0) { + denseCoefficientMatrix.update(classIndex, featureIndex, + value / featuresStd(featureIndex)) } + if (isIntercept) interceptVec.toArray(classIndex) = value } if ($(regParam) == 0.0 && isMultinomial) { @@ -607,17 +607,16 @@ class LogisticRegression @Since("1.2.0") ( Friedman, et al. "Regularization Paths for Generalized Linear Models via Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf */ - val coefficientMean = coefficientArray.sum / coefficientArray.length - coefficientArray.indices.foreach { i => coefficientArray(i) -= coefficientMean} + val denseValues = denseCoefficientMatrix.values + val coefficientMean = denseValues.sum / denseValues.length + denseCoefficientMatrix.update(_ - coefficientMean) } - val denseCoefficientMatrix = - new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true) // TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471 val compressedCoefficientMatrix = if (isMultinomial) { denseCoefficientMatrix } else { - val compressedVector = Vectors.dense(coefficientArray).compressed + val compressedVector = Vectors.dense(denseCoefficientMatrix.values).compressed compressedVector match { case dv: DenseVector => denseCoefficientMatrix case sv: SparseVector => @@ -626,25 +625,13 @@ class LogisticRegression @Since("1.2.0") ( } } - val interceptsArray: Array[Double] = if ($(fitIntercept)) { - Array.tabulate(numCoefficientSets) { i => - val coefIndex = numFeatures * numCoefficientSets + i - rawCoefficients(coefIndex) - } - } else { - Array.empty[Double] - } - val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) { - // The intercepts are never regularized, so we always center the mean. 
- val interceptMean = interceptsArray.sum / numClasses - interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } - Vectors.dense(interceptsArray) - } else if (interceptsArray.length == 1) { - Vectors.dense(interceptsArray) - } else { - Vectors.sparse(numCoefficientSets, Seq()) + // center the intercepts when using multinomial algorithm + if ($(fitIntercept) && isMultinomial) { + val interceptArray = interceptVec.toArray + val interceptMean = interceptArray.sum / interceptArray.length + (0 until interceptVec.size).foreach { i => interceptArray(i) -= interceptMean } } - (compressedCoefficientMatrix, interceptVector.compressed, arrayBuilder.result()) + (compressedCoefficientMatrix, interceptVec.compressed, arrayBuilder.result()) } } @@ -1424,6 +1411,7 @@ private class LogisticAggregator( private val numFeatures = bcFeaturesStd.value.length private val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures private val coefficientSize = bcCoefficients.value.size + private val numCoefficientSets = if (multinomial) numClasses else 1 if (multinomial) { require(numClasses == coefficientSize / numFeaturesPlusIntercept, s"The number of " + s"coefficients should be ${numClasses * numFeaturesPlusIntercept} but was $coefficientSize") @@ -1633,12 +1621,12 @@ private class LogisticAggregator( lossSum / weightSum } - def gradient: Vector = { + def gradient: Matrix = { require(weightSum > 0.0, s"The effective number of instances should be " + s"greater than 0.0, but $weightSum.") val result = Vectors.dense(gradientSumArray.clone()) scal(1.0 / weightSum, result) - result + new DenseMatrix(numCoefficientSets, numFeaturesPlusIntercept, result.toArray) } } @@ -1664,6 +1652,7 @@ private class LogisticCostFun( val featuresStd = bcFeaturesStd.value val numFeatures = featuresStd.length val numCoefficientSets = if (multinomial) numClasses else 1 + val numFeaturesPlusIntercept = if (fitIntercept) numFeatures + 1 else numFeatures val logisticAggregator = { val seqOp = (c: LogisticAggregator, instance: Instance) => c.add(instance) @@ -1675,24 +1664,25 @@ private class LogisticCostFun( )(seqOp, combOp, aggregationDepth) } - val totalGradientArray = logisticAggregator.gradient.toArray + val totalGradientMatrix = logisticAggregator.gradient + val coefMatrix = new DenseMatrix(numCoefficientSets, numFeaturesPlusIntercept, coeffs.toArray) // regVal is the sum of coefficients squares excluding intercept for L2 regularization. val regVal = if (regParamL2 == 0.0) { 0.0 } else { var sum = 0.0 - coeffs.foreachActive { case (index, value) => + coefMatrix.foreachActive { case (classIndex, featureIndex, value) => // We do not apply regularization to the intercepts - val isIntercept = fitIntercept && index >= numCoefficientSets * numFeatures + val isIntercept = fitIntercept && (featureIndex == numFeatures) if (!isIntercept) { // The following code will compute the loss of the regularization; also // the gradient of the regularization, and add back to totalGradientArray. 
sum += { if (standardization) { - totalGradientArray(index) += regParamL2 * value + val gradValue = totalGradientMatrix(classIndex, featureIndex) + totalGradientMatrix.update(classIndex, featureIndex, gradValue + regParamL2 * value) value * value } else { - val featureIndex = index / numCoefficientSets if (featuresStd(featureIndex) != 0.0) { // If `standardization` is false, we still standardize the data // to improve the rate of convergence; as a result, we have to @@ -1700,7 +1690,8 @@ private class LogisticCostFun( // differently to get effectively the same objective function when // the training dataset is not standardized. val temp = value / (featuresStd(featureIndex) * featuresStd(featureIndex)) - totalGradientArray(index) += regParamL2 * temp + val gradValue = totalGradientMatrix(classIndex, featureIndex) + totalGradientMatrix.update(classIndex, featureIndex, gradValue + regParamL2 * temp) value * temp } else { 0.0 @@ -1713,6 +1704,6 @@ private class LogisticCostFun( } bcCoeffs.destroy(blocking = false) - (logisticAggregator.loss + regVal, new BDV(totalGradientArray)) + (logisticAggregator.loss + regVal, new BDV(totalGradientMatrix.toArray)) } } From b0b2f10817f38d9cebd2e436a07d4dd3e41e9328 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 19 Nov 2016 21:50:20 -0800 Subject: [PATCH 152/534] [SPARK-18458][CORE] Fix signed integer overflow problem at an expression in RadixSort.java ## What changes were proposed in this pull request? This PR avoids that a result of an expression is negative due to signed integer overflow (e.g. 0x10?????? * 8 < 0). This PR casts each operand to `long` before executing a calculation. Since the result is interpreted as long, the result of the expression is positive. ## How was this patch tested? Manually executed query82 of TPC-DS with 100TB Author: Kazuaki Ishizaki Closes #15907 from kiszk/SPARK-18458. (cherry picked from commit d93b6552473468df297a08c0bef9ea0bf0f5c13a) Signed-off-by: Reynold Xin --- .../collection/unsafe/sort/RadixSort.java | 48 ++++++++++--------- .../unsafe/sort/UnsafeInMemorySorter.java | 2 +- .../unsafe/sort/RadixSortSuite.scala | 28 +++++------ 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java index 404361734a55b..3dd318471008b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/RadixSort.java @@ -17,6 +17,8 @@ package org.apache.spark.util.collection.unsafe.sort; +import com.google.common.primitives.Ints; + import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.LongArray; @@ -40,14 +42,14 @@ public class RadixSort { * of always copying the data back to position zero for efficiency. 
*/ public static int sort( - LongArray array, int numRecords, int startByteIndex, int endByteIndex, + LongArray array, long numRecords, int startByteIndex, int endByteIndex, boolean desc, boolean signed) { assert startByteIndex >= 0 : "startByteIndex (" + startByteIndex + ") should >= 0"; assert endByteIndex <= 7 : "endByteIndex (" + endByteIndex + ") should <= 7"; assert endByteIndex > startByteIndex; assert numRecords * 2 <= array.size(); - int inIndex = 0; - int outIndex = numRecords; + long inIndex = 0; + long outIndex = numRecords; if (numRecords > 0) { long[][] counts = getCounts(array, numRecords, startByteIndex, endByteIndex); for (int i = startByteIndex; i <= endByteIndex; i++) { @@ -55,13 +57,13 @@ public static int sort( sortAtByte( array, numRecords, counts[i], i, inIndex, outIndex, desc, signed && i == endByteIndex); - int tmp = inIndex; + long tmp = inIndex; inIndex = outIndex; outIndex = tmp; } } } - return inIndex; + return Ints.checkedCast(inIndex); } /** @@ -78,14 +80,14 @@ public static int sort( * @param signed whether this is a signed (two's complement) sort (only applies to last byte). */ private static void sortAtByte( - LongArray array, int numRecords, long[] counts, int byteIdx, int inIndex, int outIndex, + LongArray array, long numRecords, long[] counts, int byteIdx, long inIndex, long outIndex, boolean desc, boolean signed) { assert counts.length == 256; long[] offsets = transformCountsToOffsets( - counts, numRecords, array.getBaseOffset() + outIndex * 8, 8, desc, signed); + counts, numRecords, array.getBaseOffset() + outIndex * 8L, 8, desc, signed); Object baseObject = array.getBaseObject(); - long baseOffset = array.getBaseOffset() + inIndex * 8; - long maxOffset = baseOffset + numRecords * 8; + long baseOffset = array.getBaseOffset() + inIndex * 8L; + long maxOffset = baseOffset + numRecords * 8L; for (long offset = baseOffset; offset < maxOffset; offset += 8) { long value = Platform.getLong(baseObject, offset); int bucket = (int)((value >>> (byteIdx * 8)) & 0xff); @@ -106,13 +108,13 @@ private static void sortAtByte( * significant byte. If the byte does not need sorting the array will be null. */ private static long[][] getCounts( - LongArray array, int numRecords, int startByteIndex, int endByteIndex) { + LongArray array, long numRecords, int startByteIndex, int endByteIndex) { long[][] counts = new long[8][]; // Optimization: do a fast pre-pass to determine which byte indices we can skip for sorting. // If all the byte values at a particular index are the same we don't need to count it. long bitwiseMax = 0; long bitwiseMin = -1L; - long maxOffset = array.getBaseOffset() + numRecords * 8; + long maxOffset = array.getBaseOffset() + numRecords * 8L; Object baseObject = array.getBaseObject(); for (long offset = array.getBaseOffset(); offset < maxOffset; offset += 8) { long value = Platform.getLong(baseObject, offset); @@ -146,18 +148,18 @@ private static long[][] getCounts( * @return the input counts array. */ private static long[] transformCountsToOffsets( - long[] counts, int numRecords, long outputOffset, int bytesPerRecord, + long[] counts, long numRecords, long outputOffset, long bytesPerRecord, boolean desc, boolean signed) { assert counts.length == 256; int start = signed ? 128 : 0; // output the negative records first (values 129-255). 
if (desc) { - int pos = numRecords; + long pos = numRecords; for (int i = start; i < start + 256; i++) { pos -= counts[i & 0xff]; counts[i & 0xff] = outputOffset + pos * bytesPerRecord; } } else { - int pos = 0; + long pos = 0; for (int i = start; i < start + 256; i++) { long tmp = counts[i & 0xff]; counts[i & 0xff] = outputOffset + pos * bytesPerRecord; @@ -176,8 +178,8 @@ private static long[] transformCountsToOffsets( */ public static int sortKeyPrefixArray( LongArray array, - int startIndex, - int numRecords, + long startIndex, + long numRecords, int startByteIndex, int endByteIndex, boolean desc, @@ -186,8 +188,8 @@ public static int sortKeyPrefixArray( assert endByteIndex <= 7 : "endByteIndex (" + endByteIndex + ") should <= 7"; assert endByteIndex > startByteIndex; assert numRecords * 4 <= array.size(); - int inIndex = startIndex; - int outIndex = startIndex + numRecords * 2; + long inIndex = startIndex; + long outIndex = startIndex + numRecords * 2L; if (numRecords > 0) { long[][] counts = getKeyPrefixArrayCounts( array, startIndex, numRecords, startByteIndex, endByteIndex); @@ -196,13 +198,13 @@ public static int sortKeyPrefixArray( sortKeyPrefixArrayAtByte( array, numRecords, counts[i], i, inIndex, outIndex, desc, signed && i == endByteIndex); - int tmp = inIndex; + long tmp = inIndex; inIndex = outIndex; outIndex = tmp; } } } - return inIndex; + return Ints.checkedCast(inIndex); } /** @@ -210,7 +212,7 @@ public static int sortKeyPrefixArray( * getCounts with some added parameters but that seems to hurt in benchmarks. */ private static long[][] getKeyPrefixArrayCounts( - LongArray array, int startIndex, int numRecords, int startByteIndex, int endByteIndex) { + LongArray array, long startIndex, long numRecords, int startByteIndex, int endByteIndex) { long[][] counts = new long[8][]; long bitwiseMax = 0; long bitwiseMin = -1L; @@ -238,11 +240,11 @@ private static long[][] getKeyPrefixArrayCounts( * Specialization of sortAtByte() for key-prefix arrays. 
*/ private static void sortKeyPrefixArrayAtByte( - LongArray array, int numRecords, long[] counts, int byteIdx, int inIndex, int outIndex, + LongArray array, long numRecords, long[] counts, int byteIdx, long inIndex, long outIndex, boolean desc, boolean signed) { assert counts.length == 256; long[] offsets = transformCountsToOffsets( - counts, numRecords, array.getBaseOffset() + outIndex * 8, 16, desc, signed); + counts, numRecords, array.getBaseOffset() + outIndex * 8L, 16, desc, signed); Object baseObject = array.getBaseObject(); long baseOffset = array.getBaseOffset() + inIndex * 8L; long maxOffset = baseOffset + numRecords * 16L; diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 2a71e68adafad..252a35ec6bdf5 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -322,7 +322,7 @@ public UnsafeSorterIterator getSortedIterator() { if (sortComparator != null) { if (this.radixSortSupport != null) { offset = RadixSort.sortKeyPrefixArray( - array, nullBoundaryPos, (pos - nullBoundaryPos) / 2, 0, 7, + array, nullBoundaryPos, (pos - nullBoundaryPos) / 2L, 0, 7, radixSortSupport.sortDescending(), radixSortSupport.sortSigned()); } else { MemoryBlock unused = new MemoryBlock( diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala index 366ffda7788d3..d5956ea32096a 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala @@ -22,6 +22,8 @@ import java.util.{Arrays, Comparator} import scala.util.Random +import com.google.common.primitives.Ints + import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.unsafe.array.LongArray @@ -30,7 +32,7 @@ import org.apache.spark.util.collection.Sorter import org.apache.spark.util.random.XORShiftRandom class RadixSortSuite extends SparkFunSuite with Logging { - private val N = 10000 // scale this down for more readable results + private val N = 10000L // scale this down for more readable results /** * Describes a type of sort to test, e.g. two's complement descending. 
Each sort type has @@ -73,22 +75,22 @@ class RadixSortSuite extends SparkFunSuite with Logging { }, 2, 4, false, false, true)) - private def generateTestData(size: Int, rand: => Long): (Array[JLong], LongArray) = { - val ref = Array.tabulate[Long](size) { i => rand } - val extended = ref ++ Array.fill[Long](size)(0) + private def generateTestData(size: Long, rand: => Long): (Array[JLong], LongArray) = { + val ref = Array.tabulate[Long](Ints.checkedCast(size)) { i => rand } + val extended = ref ++ Array.fill[Long](Ints.checkedCast(size))(0) (ref.map(i => new JLong(i)), new LongArray(MemoryBlock.fromLongArray(extended))) } - private def generateKeyPrefixTestData(size: Int, rand: => Long): (LongArray, LongArray) = { - val ref = Array.tabulate[Long](size * 2) { i => rand } - val extended = ref ++ Array.fill[Long](size * 2)(0) + private def generateKeyPrefixTestData(size: Long, rand: => Long): (LongArray, LongArray) = { + val ref = Array.tabulate[Long](Ints.checkedCast(size * 2)) { i => rand } + val extended = ref ++ Array.fill[Long](Ints.checkedCast(size * 2))(0) (new LongArray(MemoryBlock.fromLongArray(ref)), new LongArray(MemoryBlock.fromLongArray(extended))) } - private def collectToArray(array: LongArray, offset: Int, length: Int): Array[Long] = { + private def collectToArray(array: LongArray, offset: Int, length: Long): Array[Long] = { var i = 0 - val out = new Array[Long](length) + val out = new Array[Long](Ints.checkedCast(length)) while (i < length) { out(i) = array.get(offset + i) i += 1 @@ -107,15 +109,13 @@ class RadixSortSuite extends SparkFunSuite with Logging { } } - private def referenceKeyPrefixSort(buf: LongArray, lo: Int, hi: Int, refCmp: PrefixComparator) { + private def referenceKeyPrefixSort(buf: LongArray, lo: Long, hi: Long, refCmp: PrefixComparator) { val sortBuffer = new LongArray(MemoryBlock.fromLongArray(new Array[Long](buf.size().toInt))) new Sorter(new UnsafeSortDataFormat(sortBuffer)).sort( - buf, lo, hi, new Comparator[RecordPointerAndKeyPrefix] { + buf, Ints.checkedCast(lo), Ints.checkedCast(hi), new Comparator[RecordPointerAndKeyPrefix] { override def compare( r1: RecordPointerAndKeyPrefix, - r2: RecordPointerAndKeyPrefix): Int = { - refCmp.compare(r1.keyPrefix, r2.keyPrefix) - } + r2: RecordPointerAndKeyPrefix): Int = refCmp.compare(r1.keyPrefix, r2.keyPrefix) }) } From 94a9eed11a11510a91dc4c8adb793dc3cbdef8f5 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sat, 19 Nov 2016 21:57:09 -0800 Subject: [PATCH 153/534] [SPARK-18508][SQL] Fix documentation error for DateDiff ## What changes were proposed in this pull request? The previous documentation and example for DateDiff was wrong. ## How was this patch tested? Doc only change. Author: Reynold Xin Closes #15937 from rxin/datediff-doc. 
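As a quick illustration of the corrected semantics (a sketch only, assuming a running `SparkSession` named `spark`, e.g. in spark-shell): `datediff(endDate, startDate)` counts days from `startDate` to `endDate`, so swapping the arguments flips the sign.

```scala
// datediff takes (endDate, startDate), matching the corrected docs below.
spark.sql("SELECT datediff('2009-07-31', '2009-07-30')").show()  // 1
spark.sql("SELECT datediff('2009-07-30', '2009-07-31')").show()  // -1
```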
(cherry picked from commit bce9a03677f931d52491e7768aba9e4a19a7e696) Signed-off-by: Reynold Xin --- .../sql/catalyst/expressions/datetimeExpressions.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 9cec6be841de0..1db1d1995d942 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1101,11 +1101,14 @@ case class TruncDate(date: Expression, format: Expression) * Returns the number of days from startDate to endDate. */ @ExpressionDescription( - usage = "_FUNC_(date1, date2) - Returns the number of days between `date1` and `date2`.", + usage = "_FUNC_(endDate, startDate) - Returns the number of days from `startDate` to `endDate`.", extended = """ Examples: - > SELECT _FUNC_('2009-07-30', '2009-07-31'); + > SELECT _FUNC_('2009-07-31', '2009-07-30'); 1 + + > SELECT _FUNC_('2009-07-30', '2009-07-31'); + -1 """) case class DateDiff(endDate: Expression, startDate: Expression) extends BinaryExpression with ImplicitCastInputTypes { From 063da0c8d4e82a47cf7841578dcf968080c3d89d Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sat, 19 Nov 2016 21:57:49 -0800 Subject: [PATCH 154/534] [SQL] Fix documentation for Concat and ConcatWs (cherry picked from commit a64f25d8b403b17ff68c9575f6f35b22e5b62427) Signed-off-by: Reynold Xin --- .../sql/catalyst/expressions/stringExpressions.scala | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index e74ef9a08750e..908aa44f81c97 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -40,15 +40,13 @@ import org.apache.spark.unsafe.types.{ByteArray, UTF8String} * An expression that concatenates multiple input strings into a single string. * If any input is null, concat returns null. 
*/ -// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(str1, str2, ..., strN) - Returns the concatenation of `str1`, `str2`, ..., `strN`.", + usage = "_FUNC_(str1, str2, ..., strN) - Returns the concatenation of str1, str2, ..., strN.", extended = """ Examples: - > SELECT _FUNC_('Spark','SQL'); + > SELECT _FUNC_('Spark', 'SQL'); SparkSQL """) -// scalastyle:on line.size.limit case class Concat(children: Seq[Expression]) extends Expression with ImplicitCastInputTypes { override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringType) @@ -89,8 +87,8 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas usage = "_FUNC_(sep, [str | array(str)]+) - Returns the concatenation of the strings separated by `sep`.", extended = """ Examples: - > SELECT _FUNC_(' ', Spark', 'SQL'); - Spark SQL + > SELECT _FUNC_(' ', 'Spark', 'SQL'); + Spark SQL """) // scalastyle:on line.size.limit case class ConcatWs(children: Seq[Expression]) From bc3e7b3b8a0dfc00d22bf5ee168f308a6ef5d78b Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 20 Nov 2016 09:52:03 +0000 Subject: [PATCH 155/534] [SPARK-3359][BUILD][DOCS] Print examples and disable group and tparam tags in javadoc ## What changes were proposed in this pull request? This PR proposes/fixes two things. - Remove many errors to generate javadoc with Java8 from unrecognisable tags, `tparam` and `group`. ``` [error] .../spark/mllib/target/java/org/apache/spark/ml/classification/Classifier.java:18: error: unknown tag: group [error] /** group setParam */ [error] ^ [error] .../spark/mllib/target/java/org/apache/spark/ml/classification/Classifier.java:8: error: unknown tag: tparam [error] * tparam FeaturesType Type of input features. E.g., Vector [error] ^ ... ``` It does not fully resolve the problem but remove many errors. It seems both `group` and `tparam` are unrecognisable in javadoc. It seems we can't print them pretty in javadoc in a way of `example` here because they appear differently (both examples can be found in http://spark.apache.org/docs/2.0.2/api/scala/index.html#org.apache.spark.ml.classification.Classifier). - Print `example` in javadoc. Currently, there are few `example` tag in several places. ``` ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example This operation might be used to evaluate a graph ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example We might use this operation to change the vertex values ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example This function might be used to initialize edge ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example This function might be used to initialize edge ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example This function might be used to initialize edge ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example We can use this function to compute the in-degree of each ./graphx/src/main/scala/org/apache/spark/graphx/Graph.scala: * example This function is used to update the vertices with new values based on external data. 
./graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala: * example Loads a file in the following format: ./graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala: * example This function is used to update the vertices with new ./graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala: * example This function can be used to filter the graph based on some property, without ./graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala: * example We can use the Pregel abstraction to implement PageRank: ./graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala: * example Construct a `VertexRDD` from a plain RDD: ./repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkCommandLine.scala: * example new SparkCommandLine(Nil).settings ./repl/scala-2.10/src/main/scala/org/apache/spark/repl/SparkIMain.scala: * example addImports("org.apache.spark.SparkContext") ./sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala: * example {{{ ``` **Before** (screenshot of the generated javadoc) **After** (screenshot of the generated javadoc) ## How was this patch tested? Manually tested by `jekyll build` with Java 7 and 8 ``` java version "1.7.0_80" Java(TM) SE Runtime Environment (build 1.7.0_80-b15) Java HotSpot(TM) 64-Bit Server VM (build 24.80-b11, mixed mode) ``` ``` java version "1.8.0_45" Java(TM) SE Runtime Environment (build 1.8.0_45-b14) Java HotSpot(TM) 64-Bit Server VM (build 25.45-b02, mixed mode) ``` Note: this does not make sbt unidoc succeed with Java 8 yet, but it reduces the number of errors with Java 8. Author: hyukjinkwon Closes #15939 from HyukjinKwon/SPARK-3359-javadoc. (cherry picked from commit c528812ce770fd8a6626e7f9d2f8ca9d1e84642b) Signed-off-by: Sean Owen --- pom.xml | 13 +++++++++++++ project/SparkBuild.scala | 5 ++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 024b2850d0a3d..7c0b0b59dc62b 100644 --- a/pom.xml +++ b/pom.xml @@ -2477,11 +2477,24 @@ -Xdoclint:all -Xdoclint:-missing + + example + a + Example: + note a Note: + + group + X + + + tparam + X + diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 92b45657210e1..429a163d22a6d 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -742,7 +742,10 @@ object Unidoc { "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", "-noqualifier", "java.lang", - "-tag", """note:a:Note\:""" + "-tag", """example:a:Example\:""", + "-tag", """note:a:Note\:""", + "-tag", "group:X", + "-tag", "tparam:X" ), // Use GitHub repository for Scaladoc source links From cffaf5035816fa6ffc4dadd47bede1eff6371fee Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Sun, 20 Nov 2016 12:46:29 -0800 Subject: [PATCH 156/534] [SPARK-17732][SQL] Revert ALTER TABLE DROP PARTITION should support comparators This reverts commit 1126c3194ee1c79015cf1d3808bc963aa93dcadf. Author: Herman van Hovell Closes #15948 from hvanhovell/SPARK-17732.
--- .../spark/sql/catalyst/parser/SqlBase.g4 | 6 +- .../sql/catalyst/parser/AstBuilder.scala | 30 +---- .../spark/sql/execution/SparkSqlParser.scala | 2 +- .../spark/sql/execution/command/ddl.scala | 51 ++------- .../datasources/DataSourceStrategy.scala | 8 +- .../execution/command/DDLCommandSuite.scala | 9 +- .../sql/hive/execution/HiveDDLSuite.scala | 103 ------------------ 7 files changed, 24 insertions(+), 185 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index fcca11c69f0a3..b599a884957a8 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -239,7 +239,11 @@ partitionSpecLocation ; partitionSpec - : PARTITION '(' expression (',' expression)* ')' + : PARTITION '(' partitionVal (',' partitionVal)* ')' + ; + +partitionVal + : identifier (EQ constant)? ; describeFuncName diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 97056bba9d763..2006844923cf7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -194,15 +194,10 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { */ override def visitPartitionSpec( ctx: PartitionSpecContext): Map[String, Option[String]] = withOrigin(ctx) { - val parts = ctx.expression.asScala.map { pVal => - expression(pVal) match { - case UnresolvedAttribute(name :: Nil) => - name -> None - case cmp @ EqualTo(UnresolvedAttribute(name :: Nil), constant: Literal) => - name -> Option(constant.toString) - case _ => - throw new ParseException("Invalid partition filter specification", ctx) - } + val parts = ctx.partitionVal.asScala.map { pVal => + val name = pVal.identifier.getText + val value = Option(pVal.constant).map(visitStringConstant) + name -> value } // Before calling `toMap`, we check duplicated keys to avoid silently ignore partition values // in partition spec like PARTITION(a='1', b='2', a='3'). The real semantical check for @@ -211,23 +206,6 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { parts.toMap } - /** - * Create a partition filter specification. - */ - def visitPartitionFilterSpec(ctx: PartitionSpecContext): Expression = withOrigin(ctx) { - val parts = ctx.expression.asScala.map { pVal => - expression(pVal) match { - case EqualNullSafe(_, _) => - throw new ParseException("'<=>' operator is not allowed in partition specification.", ctx) - case cmp @ BinaryComparison(UnresolvedAttribute(name :: Nil), constant: Literal) => - cmp.withNewChildren(Seq(AttributeReference(name, StringType)(), constant)) - case _ => - throw new ParseException("Invalid partition filter specification", ctx) - } - } - parts.reduceLeft(And) - } - /** * Create a partition specification map without optional values. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 112d812cb6c76..b8be3d17ba444 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -813,7 +813,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { } AlterTableDropPartitionCommand( visitTableIdentifier(ctx.tableIdentifier), - ctx.partitionSpec.asScala.map(visitPartitionFilterSpec), + ctx.partitionSpec.asScala.map(visitNonOptionalPartitionSpec), ctx.EXISTS != null, ctx.PURGE != null) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 588aa05c37b49..570a9967871e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -31,8 +31,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BinaryComparison} -import org.apache.spark.sql.catalyst.expressions.{EqualTo, Expression, PredicateHelper} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.datasources.PartitioningUtils import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -419,55 +418,27 @@ case class AlterTableRenamePartitionCommand( */ case class AlterTableDropPartitionCommand( tableName: TableIdentifier, - specs: Seq[Expression], + specs: Seq[TablePartitionSpec], ifExists: Boolean, purge: Boolean) - extends RunnableCommand with PredicateHelper { - - private def isRangeComparison(expr: Expression): Boolean = { - expr.find(e => e.isInstanceOf[BinaryComparison] && !e.isInstanceOf[EqualTo]).isDefined - } + extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog val table = catalog.getTableMetadata(tableName) - val resolver = sparkSession.sessionState.conf.resolver DDLUtils.verifyAlterTableType(catalog, table, isView = false) DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE DROP PARTITION") - specs.foreach { expr => - expr.references.foreach { attr => - if (!table.partitionColumnNames.exists(resolver(_, attr.name))) { - throw new AnalysisException(s"${attr.name} is not a valid partition column " + - s"in table ${table.identifier.quotedString}.") - } - } + val normalizedSpecs = specs.map { spec => + PartitioningUtils.normalizePartitionSpec( + spec, + table.partitionColumnNames, + table.identifier.quotedString, + sparkSession.sessionState.conf.resolver) } - if (specs.exists(isRangeComparison)) { - val partitionSet = specs.flatMap { spec => - val partitions = catalog.listPartitionsByFilter(table.identifier, Seq(spec)).map(_.spec) - if (partitions.isEmpty && !ifExists) { - throw new AnalysisException(s"There is no partition for ${spec.sql}") - } - partitions - }.distinct - catalog.dropPartitions( - table.identifier, partitionSet, ignoreIfNotExists = ifExists, purge = purge) - } else { - val normalizedSpecs = specs.map { expr => - val spec = 
splitConjunctivePredicates(expr).map { - case BinaryComparison(AttributeReference(name, _, _, _), right) => name -> right.toString - }.toMap - PartitioningUtils.normalizePartitionSpec( - spec, - table.partitionColumnNames, - table.identifier.quotedString, - resolver) - } - catalog.dropPartitions( - table.identifier, normalizedSpecs, ignoreIfNotExists = ifExists, purge = purge) - } + catalog.dropPartitions( + table.identifier, normalizedSpecs, ignoreIfNotExists = ifExists, purge = purge) Seq.empty[Row] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index e81512d1abf84..4f19a2d00b0e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -215,14 +215,8 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { if (overwrite.enabled) { val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions if (deletedPartitions.nonEmpty) { - import org.apache.spark.sql.catalyst.expressions._ - val expressions = deletedPartitions.map { specs => - specs.map { case (key, value) => - EqualTo(AttributeReference(key, StringType)(), Literal.create(value, StringType)) - }.reduceLeft(And) - }.toSeq AlterTableDropPartitionCommand( - l.catalogTable.get.identifier, expressions, + l.catalogTable.get.identifier, deletedPartitions.toSeq, ifExists = true, purge = true).run(t.sparkSession) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 057528bef5084..d31e7aeb3a78a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -21,7 +21,6 @@ import scala.reflect.{classTag, ClassTag} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, EqualTo, Literal} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.Project @@ -613,12 +612,8 @@ class DDLCommandSuite extends PlanTest { val expected1_table = AlterTableDropPartitionCommand( tableIdent, Seq( - And( - EqualTo(AttributeReference("dt", StringType)(), Literal.create("2008-08-08", StringType)), - EqualTo(AttributeReference("country", StringType)(), Literal.create("us", StringType))), - And( - EqualTo(AttributeReference("dt", StringType)(), Literal.create("2009-09-09", StringType)), - EqualTo(AttributeReference("country", StringType)(), Literal.create("uk", StringType)))), + Map("dt" -> "2008-08-08", "country" -> "us"), + Map("dt" -> "2009-09-09", "country" -> "uk")), ifExists = true, purge = false) val expected2_table = expected1_table.copy(ifExists = false) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 15e3927b755af..951e0704148b3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ 
-26,7 +26,6 @@ import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -226,108 +225,6 @@ class HiveDDLSuite } } - test("SPARK-17732: Drop partitions by filter") { - withTable("sales") { - sql("CREATE TABLE sales(id INT) PARTITIONED BY (country STRING, quarter STRING)") - - for (country <- Seq("US", "CA", "KR")) { - for (quarter <- 1 to 4) { - sql(s"ALTER TABLE sales ADD PARTITION (country = '$country', quarter = '$quarter')") - } - } - - sql("ALTER TABLE sales DROP PARTITION (country < 'KR', quarter > '2')") - checkAnswer(sql("SHOW PARTITIONS sales"), - Row("country=CA/quarter=1") :: - Row("country=CA/quarter=2") :: - Row("country=KR/quarter=1") :: - Row("country=KR/quarter=2") :: - Row("country=KR/quarter=3") :: - Row("country=KR/quarter=4") :: - Row("country=US/quarter=1") :: - Row("country=US/quarter=2") :: - Row("country=US/quarter=3") :: - Row("country=US/quarter=4") :: Nil) - - sql("ALTER TABLE sales DROP PARTITION (country < 'KR'), PARTITION (quarter <= '1')") - checkAnswer(sql("SHOW PARTITIONS sales"), - Row("country=KR/quarter=2") :: - Row("country=KR/quarter=3") :: - Row("country=KR/quarter=4") :: - Row("country=US/quarter=2") :: - Row("country=US/quarter=3") :: - Row("country=US/quarter=4") :: Nil) - - sql("ALTER TABLE sales DROP PARTITION (country='KR', quarter='4')") - sql("ALTER TABLE sales DROP PARTITION (country='US', quarter='3')") - checkAnswer(sql("SHOW PARTITIONS sales"), - Row("country=KR/quarter=2") :: - Row("country=KR/quarter=3") :: - Row("country=US/quarter=2") :: - Row("country=US/quarter=4") :: Nil) - - sql("ALTER TABLE sales DROP PARTITION (quarter <= 2), PARTITION (quarter >= '4')") - checkAnswer(sql("SHOW PARTITIONS sales"), - Row("country=KR/quarter=3") :: Nil) - - // According to the declarative partition spec definitions, this drops the union of target - // partitions without exceptions. Hive raises exceptions because it handles them sequentially. 
- sql("ALTER TABLE sales DROP PARTITION (quarter <= 4), PARTITION (quarter <= '3')") - checkAnswer(sql("SHOW PARTITIONS sales"), Nil) - } - } - - test("SPARK-17732: Error handling for drop partitions by filter") { - withTable("sales") { - sql("CREATE TABLE sales(id INT) PARTITIONED BY (country STRING, quarter STRING)") - - val m = intercept[AnalysisException] { - sql("ALTER TABLE sales DROP PARTITION (unknown = 'KR')") - }.getMessage - assert(m.contains("unknown is not a valid partition column in table")) - - val m2 = intercept[AnalysisException] { - sql("ALTER TABLE sales DROP PARTITION (unknown < 'KR')") - }.getMessage - assert(m2.contains("unknown is not a valid partition column in table")) - - val m3 = intercept[AnalysisException] { - sql("ALTER TABLE sales DROP PARTITION (unknown <=> 'KR')") - }.getMessage - assert(m3.contains("'<=>' operator is not allowed in partition specification")) - - val m4 = intercept[ParseException] { - sql("ALTER TABLE sales DROP PARTITION (unknown <=> upper('KR'))") - }.getMessage - assert(m4.contains("'<=>' operator is not allowed in partition specification")) - - val m5 = intercept[ParseException] { - sql("ALTER TABLE sales DROP PARTITION (country < 'KR', quarter)") - }.getMessage - assert(m5.contains("Invalid partition filter specification")) - - sql(s"ALTER TABLE sales ADD PARTITION (country = 'KR', quarter = '3')") - val m6 = intercept[AnalysisException] { - sql("ALTER TABLE sales DROP PARTITION (quarter <= '4'), PARTITION (quarter <= '2')") - }.getMessage - // The query is not executed because `PARTITION (quarter <= '2')` is invalid. - checkAnswer(sql("SHOW PARTITIONS sales"), - Row("country=KR/quarter=3") :: Nil) - assert(m6.contains("There is no partition for (`quarter` <= '2')")) - } - } - - test("SPARK-17732: Partition filter is not allowed in ADD PARTITION") { - withTable("sales") { - sql("CREATE TABLE sales(id INT) PARTITIONED BY (country STRING, quarter STRING)") - - val m = intercept[ParseException] { - sql("ALTER TABLE sales ADD PARTITION (country = 'US', quarter < '1')") - }.getMessage() - assert(m.contains("Invalid partition filter specification")) - } - } - test("drop views") { withTable("tab1") { val tabName = "tab1" From f8662db72815b9c89f2448511d117e6d224e0b11 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sun, 20 Nov 2016 20:00:59 -0800 Subject: [PATCH 157/534] [HOTFIX][SQL] Fix DDLSuite failure. 
(cherry picked from commit b625a36ebc59cbacc223fc03005bc0f6d296b6e7) Signed-off-by: Reynold Xin --- .../org/apache/spark/sql/execution/command/DDLSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index a01073987423e..02d9d15684904 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1426,8 +1426,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("DESCRIBE FUNCTION 'concat'"), Row("Class: org.apache.spark.sql.catalyst.expressions.Concat") :: Row("Function: concat") :: - Row("Usage: concat(str1, str2, ..., strN) " + - "- Returns the concatenation of `str1`, `str2`, ..., `strN`.") :: Nil + Row("Usage: concat(str1, str2, ..., strN) - " + + "Returns the concatenation of str1, str2, ..., strN.") :: Nil ) // extended mode checkAnswer( From fb4e6359d1fdb9e4f05fcfa03839024e8b91b47a Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Mon, 21 Nov 2016 12:05:01 +0800 Subject: [PATCH 158/534] [SPARK-18467][SQL] Extracts method for preparing arguments from StaticInvoke, Invoke and NewInstance and modify to short circuit if arguments have null when `needNullCheck == true`. ## What changes were proposed in this pull request? This pr extracts method for preparing arguments from `StaticInvoke`, `Invoke` and `NewInstance` and modify to short circuit if arguments have `null` when `propageteNull == true`. The steps are as follows: 1. Introduce `InvokeLike` to extract common logic from `StaticInvoke`, `Invoke` and `NewInstance` to prepare arguments. `StaticInvoke` and `Invoke` had a risk to exceed 64kb JVM limit to prepare arguments but after this patch they can handle them because they share the preparing code of NewInstance, which handles the limit well. 2. Remove unneeded null checking and fix nullability of `NewInstance`. Avoid some of nullabilty checking which are not needed because the expression is not nullable. 3. Modify to short circuit if arguments have `null` when `needNullCheck == true`. If `needNullCheck == true`, preparing arguments can be skipped if we found one of them is `null`, so modified to short circuit in the case. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15901 from ueshin/issues/SPARK-18467. (cherry picked from commit 658547974915ebcaae83e13e4c3bdf68d5426fda) Signed-off-by: Wenchen Fan --- .../expressions/objects/objects.scala | 163 +++++++++++------- 1 file changed, 101 insertions(+), 62 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 0e3d99127ed56..0b36091ece1bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -32,6 +32,78 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCo import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types._ +/** + * Common base class for [[StaticInvoke]], [[Invoke]], and [[NewInstance]]. 
+ */ +trait InvokeLike extends Expression with NonSQLExpression { + + def arguments: Seq[Expression] + + def propagateNull: Boolean + + protected lazy val needNullCheck: Boolean = propagateNull && arguments.exists(_.nullable) + + /** + * Prepares codes for arguments. + * + * - generate codes for argument. + * - use ctx.splitExpressions() to not exceed 64kb JVM limit while preparing arguments. + * - avoid some of nullabilty checking which are not needed because the expression is not + * nullable. + * - when needNullCheck == true, short circuit if we found one of arguments is null because + * preparing rest of arguments can be skipped in the case. + * + * @param ctx a [[CodegenContext]] + * @return (code to prepare arguments, argument string, result of argument null check) + */ + def prepareArguments(ctx: CodegenContext): (String, String, String) = { + + val resultIsNull = if (needNullCheck) { + val resultIsNull = ctx.freshName("resultIsNull") + ctx.addMutableState("boolean", resultIsNull, "") + resultIsNull + } else { + "false" + } + val argValues = arguments.map { e => + val argValue = ctx.freshName("argValue") + ctx.addMutableState(ctx.javaType(e.dataType), argValue, "") + argValue + } + + val argCodes = if (needNullCheck) { + val reset = s"$resultIsNull = false;" + val argCodes = arguments.zipWithIndex.map { case (e, i) => + val expr = e.genCode(ctx) + val updateResultIsNull = if (e.nullable) { + s"$resultIsNull = ${expr.isNull};" + } else { + "" + } + s""" + if (!$resultIsNull) { + ${expr.code} + $updateResultIsNull + ${argValues(i)} = ${expr.value}; + } + """ + } + reset +: argCodes + } else { + arguments.zipWithIndex.map { case (e, i) => + val expr = e.genCode(ctx) + s""" + ${expr.code} + ${argValues(i)} = ${expr.value}; + """ + } + } + val argCode = ctx.splitExpressions(ctx.INPUT_ROW, argCodes) + + (argCode, argValues.mkString(", "), resultIsNull) + } +} + /** * Invokes a static function, returning the result. By default, any of the arguments being null * will result in returning null instead of calling the function. @@ -50,7 +122,7 @@ case class StaticInvoke( dataType: DataType, functionName: String, arguments: Seq[Expression] = Nil, - propagateNull: Boolean = true) extends Expression with NonSQLExpression { + propagateNull: Boolean = true) extends InvokeLike { val objectName = staticObject.getName.stripSuffix("$") @@ -62,16 +134,10 @@ case class StaticInvoke( override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = ctx.javaType(dataType) - val argGen = arguments.map(_.genCode(ctx)) - val argString = argGen.map(_.value).mkString(", ") - val callFunc = s"$objectName.$functionName($argString)" + val (argCode, argString, resultIsNull) = prepareArguments(ctx) - val setIsNull = if (propagateNull && arguments.nonEmpty) { - s"boolean ${ev.isNull} = ${argGen.map(_.isNull).mkString(" || ")};" - } else { - s"boolean ${ev.isNull} = false;" - } + val callFunc = s"$objectName.$functionName($argString)" // If the function can return null, we do an extra check to make sure our null bit is still set // correctly. @@ -82,9 +148,9 @@ case class StaticInvoke( } val code = s""" - ${argGen.map(_.code).mkString("\n")} - $setIsNull - final $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : $callFunc; + $argCode + boolean ${ev.isNull} = $resultIsNull; + final $javaType ${ev.value} = $resultIsNull ? 
${ctx.defaultValue(dataType)} : $callFunc; $postNullCheck """ ev.copy(code = code) @@ -103,13 +169,15 @@ case class StaticInvoke( * @param functionName The name of the method to call. * @param dataType The expected return type of the function. * @param arguments An optional list of expressions, whos evaluation will be passed to the function. + * @param propagateNull When true, and any of the arguments is null, null will be returned instead + * of calling the function. */ case class Invoke( targetObject: Expression, functionName: String, dataType: DataType, arguments: Seq[Expression] = Nil, - propagateNull: Boolean = true) extends Expression with NonSQLExpression { + propagateNull: Boolean = true) extends InvokeLike { override def nullable: Boolean = true override def children: Seq[Expression] = targetObject +: arguments @@ -131,8 +199,8 @@ case class Invoke( override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = ctx.javaType(dataType) val obj = targetObject.genCode(ctx) - val argGen = arguments.map(_.genCode(ctx)) - val argString = argGen.map(_.value).mkString(", ") + + val (argCode, argString, resultIsNull) = prepareArguments(ctx) val returnPrimitive = method.isDefined && method.get.getReturnType.isPrimitive val needTryCatch = method.isDefined && method.get.getExceptionTypes.nonEmpty @@ -164,12 +232,6 @@ case class Invoke( """ } - val setIsNull = if (propagateNull && arguments.nonEmpty) { - s"boolean ${ev.isNull} = ${obj.isNull} || ${argGen.map(_.isNull).mkString(" || ")};" - } else { - s"boolean ${ev.isNull} = ${obj.isNull};" - } - // If the function can return null, we do an extra check to make sure our null bit is still set // correctly. val postNullCheck = if (ctx.defaultValue(dataType) == "null") { @@ -177,15 +239,19 @@ case class Invoke( } else { "" } + val code = s""" ${obj.code} - ${argGen.map(_.code).mkString("\n")} - $setIsNull + boolean ${ev.isNull} = true; $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - $evaluate + if (!${obj.isNull}) { + $argCode + ${ev.isNull} = $resultIsNull; + if (!${ev.isNull}) { + $evaluate + } + $postNullCheck } - $postNullCheck """ ev.copy(code = code) } @@ -223,10 +289,10 @@ case class NewInstance( arguments: Seq[Expression], propagateNull: Boolean, dataType: DataType, - outerPointer: Option[() => AnyRef]) extends Expression with NonSQLExpression { + outerPointer: Option[() => AnyRef]) extends InvokeLike { private val className = cls.getName - override def nullable: Boolean = propagateNull + override def nullable: Boolean = needNullCheck override def children: Seq[Expression] = arguments @@ -245,52 +311,25 @@ case class NewInstance( override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = ctx.javaType(dataType) - val argIsNulls = ctx.freshName("argIsNulls") - ctx.addMutableState("boolean[]", argIsNulls, - s"$argIsNulls = new boolean[${arguments.size}];") - val argValues = arguments.zipWithIndex.map { case (e, i) => - val argValue = ctx.freshName("argValue") - ctx.addMutableState(ctx.javaType(e.dataType), argValue, "") - argValue - } - val argCodes = arguments.zipWithIndex.map { case (e, i) => - val expr = e.genCode(ctx) - expr.code + s""" - $argIsNulls[$i] = ${expr.isNull}; - ${argValues(i)} = ${expr.value}; - """ - } - val argCode = ctx.splitExpressions(ctx.INPUT_ROW, argCodes) + val (argCode, argString, resultIsNull) = prepareArguments(ctx) val outer = outerPointer.map(func => Literal.fromObject(func()).genCode(ctx)) - var isNull = ev.isNull - val setIsNull 
= if (propagateNull && arguments.nonEmpty) { - s""" - boolean $isNull = false; - for (int idx = 0; idx < ${arguments.length}; idx++) { - if ($argIsNulls[idx]) { $isNull = true; break; } - } - """ - } else { - isNull = "false" - "" - } + ev.isNull = resultIsNull val constructorCall = outer.map { gen => - s"""${gen.value}.new ${cls.getSimpleName}(${argValues.mkString(", ")})""" + s"${gen.value}.new ${cls.getSimpleName}($argString)" }.getOrElse { - s"new $className(${argValues.mkString(", ")})" + s"new $className($argString)" } val code = s""" $argCode ${outer.map(_.code).getOrElse("")} - $setIsNull - final $javaType ${ev.value} = $isNull ? ${ctx.defaultValue(javaType)} : $constructorCall; - """ - ev.copy(code = code, isNull = isNull) + final $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(javaType)} : $constructorCall; + """ + ev.copy(code = code) } override def toString: String = s"newInstance($cls)" From 31002e4a77ca56492f41bf35e7c8f263d767d3aa Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 21 Nov 2016 05:36:49 -0800 Subject: [PATCH 159/534] [SPARK-18282][ML][PYSPARK] Add python clustering summaries for GMM and BKM ## What changes were proposed in this pull request? Add model summary APIs for `GaussianMixtureModel` and `BisectingKMeansModel` in pyspark. ## How was this patch tested? Unit tests. Author: sethah Closes #15777 from sethah/pyspark_cluster_summaries. (cherry picked from commit e811fbf9ed131bccbc46f3c5701c4ff317222fd9) Signed-off-by: Yanbo Liang --- .../classification/LogisticRegression.scala | 11 +- .../spark/ml/clustering/BisectingKMeans.scala | 9 +- .../spark/ml/clustering/GaussianMixture.scala | 9 +- .../apache/spark/ml/clustering/KMeans.scala | 9 +- .../GeneralizedLinearRegression.scala | 11 +- .../ml/regression/LinearRegression.scala | 14 +- .../LogisticRegressionSuite.scala | 2 + .../ml/clustering/BisectingKMeansSuite.scala | 3 + .../ml/clustering/GaussianMixtureSuite.scala | 3 + .../spark/ml/clustering/KMeansSuite.scala | 3 + .../GeneralizedLinearRegressionSuite.scala | 2 + .../ml/regression/LinearRegressionSuite.scala | 2 + python/pyspark/ml/classification.py | 15 +- python/pyspark/ml/clustering.py | 162 +++++++++++++++++- python/pyspark/ml/regression.py | 16 +- python/pyspark/ml/tests.py | 32 ++++ 16 files changed, 256 insertions(+), 47 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index f58efd36a1c66..d07b4adebb08f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -648,7 +648,7 @@ class LogisticRegression @Since("1.2.0") ( $(labelCol), $(featuresCol), objectiveHistory) - model.setSummary(logRegSummary) + model.setSummary(Some(logRegSummary)) } else { model } @@ -790,9 +790,9 @@ class LogisticRegressionModel private[spark] ( } } - private[classification] def setSummary( - summary: LogisticRegressionTrainingSummary): this.type = { - this.trainingSummary = Some(summary) + private[classification] + def setSummary(summary: Option[LogisticRegressionTrainingSummary]): this.type = { + this.trainingSummary = summary this } @@ -887,8 +887,7 @@ class LogisticRegressionModel private[spark] ( override def copy(extra: ParamMap): LogisticRegressionModel = { val newModel = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector, numClasses, isMultinomial), extra) - if 
(trainingSummary.isDefined) newModel.setSummary(trainingSummary.get) - newModel.setParent(parent) + newModel.setSummary(trainingSummary).setParent(parent) } override protected def raw2prediction(rawPrediction: Vector): Double = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index f8a606d60b2aa..e6ca3aedffd9d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -95,8 +95,7 @@ class BisectingKMeansModel private[ml] ( @Since("2.0.0") override def copy(extra: ParamMap): BisectingKMeansModel = { val copied = copyValues(new BisectingKMeansModel(uid, parentModel), extra) - if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get) - copied.setParent(this.parent) + copied.setSummary(trainingSummary).setParent(this.parent) } @Since("2.0.0") @@ -132,8 +131,8 @@ class BisectingKMeansModel private[ml] ( private var trainingSummary: Option[BisectingKMeansSummary] = None - private[clustering] def setSummary(summary: BisectingKMeansSummary): this.type = { - this.trainingSummary = Some(summary) + private[clustering] def setSummary(summary: Option[BisectingKMeansSummary]): this.type = { + this.trainingSummary = summary this } @@ -265,7 +264,7 @@ class BisectingKMeans @Since("2.0.0") ( val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this)) val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) - model.setSummary(summary) + model.setSummary(Some(summary)) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index c6035cc4c9647..92d0b7d085f12 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -90,8 +90,7 @@ class GaussianMixtureModel private[ml] ( @Since("2.0.0") override def copy(extra: ParamMap): GaussianMixtureModel = { val copied = copyValues(new GaussianMixtureModel(uid, weights, gaussians), extra) - if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get) - copied.setParent(this.parent) + copied.setSummary(trainingSummary).setParent(this.parent) } @Since("2.0.0") @@ -150,8 +149,8 @@ class GaussianMixtureModel private[ml] ( private var trainingSummary: Option[GaussianMixtureSummary] = None - private[clustering] def setSummary(summary: GaussianMixtureSummary): this.type = { - this.trainingSummary = Some(summary) + private[clustering] def setSummary(summary: Option[GaussianMixtureSummary]): this.type = { + this.trainingSummary = summary this } @@ -340,7 +339,7 @@ class GaussianMixture @Since("2.0.0") ( .setParent(this) val summary = new GaussianMixtureSummary(model.transform(dataset), $(predictionCol), $(probabilityCol), $(featuresCol), $(k)) - model.setSummary(summary) + model.setSummary(Some(summary)) instr.logNumFeatures(model.gaussians.head.mean.size) instr.logSuccess(model) model diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 26505b4cc1501..152bd13b7a17a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -110,8 
+110,7 @@ class KMeansModel private[ml] ( @Since("1.5.0") override def copy(extra: ParamMap): KMeansModel = { val copied = copyValues(new KMeansModel(uid, parentModel), extra) - if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get) - copied.setParent(this.parent) + copied.setSummary(trainingSummary).setParent(this.parent) } /** @group setParam */ @@ -165,8 +164,8 @@ class KMeansModel private[ml] ( private var trainingSummary: Option[KMeansSummary] = None - private[clustering] def setSummary(summary: KMeansSummary): this.type = { - this.trainingSummary = Some(summary) + private[clustering] def setSummary(summary: Option[KMeansSummary]): this.type = { + this.trainingSummary = summary this } @@ -325,7 +324,7 @@ class KMeans @Since("1.5.0") ( val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) val summary = new KMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) - model.setSummary(summary) + model.setSummary(Some(summary)) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 736fd3b9e0f64..3f9de1fe74c9c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -270,7 +270,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val .setParent(this)) val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, wlsModel.diagInvAtWA.toArray, 1, getSolver) - return model.setSummary(trainingSummary) + return model.setSummary(Some(trainingSummary)) } // Fit Generalized Linear Model by iteratively reweighted least squares (IRLS). 
@@ -284,7 +284,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val .setParent(this)) val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model, irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver) - model.setSummary(trainingSummary) + model.setSummary(Some(trainingSummary)) } @Since("2.0.0") @@ -761,8 +761,8 @@ class GeneralizedLinearRegressionModel private[ml] ( def hasSummary: Boolean = trainingSummary.nonEmpty private[regression] - def setSummary(summary: GeneralizedLinearRegressionTrainingSummary): this.type = { - this.trainingSummary = Some(summary) + def setSummary(summary: Option[GeneralizedLinearRegressionTrainingSummary]): this.type = { + this.trainingSummary = summary this } @@ -778,8 +778,7 @@ class GeneralizedLinearRegressionModel private[ml] ( override def copy(extra: ParamMap): GeneralizedLinearRegressionModel = { val copied = copyValues(new GeneralizedLinearRegressionModel(uid, coefficients, intercept), extra) - if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get) - copied.setParent(parent) + copied.setSummary(trainingSummary).setParent(parent) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index da7ce6b46f2ab..8ea5e1e6c453a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -225,7 +225,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model.diagInvAtWA.toArray, model.objectiveHistory) - return lrModel.setSummary(trainingSummary) + return lrModel.setSummary(Some(trainingSummary)) } val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE @@ -278,7 +278,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model, Array(0D), Array(0D)) - return model.setSummary(trainingSummary) + return model.setSummary(Some(trainingSummary)) } else { require($(regParam) == 0.0, "The standard deviation of the label is zero. 
" + "Model cannot be regularized.") @@ -400,7 +400,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String model, Array(0D), objectiveHistory) - model.setSummary(trainingSummary) + model.setSummary(Some(trainingSummary)) } @Since("1.4.0") @@ -446,8 +446,9 @@ class LinearRegressionModel private[ml] ( throw new SparkException("No training summary available for this LinearRegressionModel") } - private[regression] def setSummary(summary: LinearRegressionTrainingSummary): this.type = { - this.trainingSummary = Some(summary) + private[regression] + def setSummary(summary: Option[LinearRegressionTrainingSummary]): this.type = { + this.trainingSummary = summary this } @@ -490,8 +491,7 @@ class LinearRegressionModel private[ml] ( @Since("1.4.0") override def copy(extra: ParamMap): LinearRegressionModel = { val newModel = copyValues(new LinearRegressionModel(uid, coefficients, intercept), extra) - if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get) - newModel.setParent(parent) + newModel.setSummary(trainingSummary).setParent(parent) } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 2877285eb4d59..e360542eae2ab 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -147,6 +147,8 @@ class LogisticRegressionSuite assert(model.hasSummary) val copiedModel = model.copy(ParamMap.empty) assert(copiedModel.hasSummary) + model.setSummary(None) + assert(!model.hasSummary) } test("empty probabilityCol") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index 49797d938d751..fc491cd6161fd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -109,6 +109,9 @@ class BisectingKMeansSuite assert(clusterSizes.length === k) assert(clusterSizes.sum === numRows) assert(clusterSizes.forall(_ >= 0)) + + model.setSummary(None) + assert(!model.hasSummary) } test("read/write") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala index 7165b63ed3b96..07299123f8a47 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala @@ -111,6 +111,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext assert(clusterSizes.length === k) assert(clusterSizes.sum === numRows) assert(clusterSizes.forall(_ >= 0)) + + model.setSummary(None) + assert(!model.hasSummary) } test("read/write") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index 73972557d2631..c1b7242e11a8f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -123,6 +123,9 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR assert(clusterSizes.length === k) assert(clusterSizes.sum === numRows) 
assert(clusterSizes.forall(_ >= 0)) + + model.setSummary(None) + assert(!model.hasSummary) } test("KMeansModel transform with non-default feature and prediction cols") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 6a4ac1735b2cb..9b0fa67630d2e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -197,6 +197,8 @@ class GeneralizedLinearRegressionSuite assert(model.hasSummary) val copiedModel = model.copy(ParamMap.empty) assert(copiedModel.hasSummary) + model.setSummary(None) + assert(!model.hasSummary) assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index df97d0b2ae7ad..0be82742a33be 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -146,6 +146,8 @@ class LinearRegressionSuite assert(model.hasSummary) val copiedModel = model.copy(ParamMap.empty) assert(copiedModel.hasSummary) + model.setSummary(None) + assert(!model.hasSummary) model.transform(datasetWithDenseFeature) .select("label", "prediction") diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 56c8c62259e79..83e1e89347660 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -309,13 +309,16 @@ def interceptVector(self): @since("2.0.0") def summary(self): """ - Gets summary (e.g. residuals, mse, r-squared ) of model on - training set. An exception is thrown if - `trainingSummary is None`. + Gets summary (e.g. accuracy/precision/recall, objective history, total iterations) of model + trained on the training set. An exception is thrown if `trainingSummary is None`. 
""" - java_blrt_summary = self._call_java("summary") - # Note: Once multiclass is added, update this to return correct summary - return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) + if self.hasSummary: + java_blrt_summary = self._call_java("summary") + # Note: Once multiclass is added, update this to return correct summary + return BinaryLogisticRegressionTrainingSummary(java_blrt_summary) + else: + raise RuntimeError("No training summary available for this %s" % + self.__class__.__name__) @property @since("2.0.0") diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 7632f05c3b68c..e58ec1e7ac296 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -17,16 +17,74 @@ from pyspark import since, keyword_only from pyspark.ml.util import * -from pyspark.ml.wrapper import JavaEstimator, JavaModel +from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper from pyspark.ml.param.shared import * from pyspark.ml.common import inherit_doc -__all__ = ['BisectingKMeans', 'BisectingKMeansModel', +__all__ = ['BisectingKMeans', 'BisectingKMeansModel', 'BisectingKMeansSummary', 'KMeans', 'KMeansModel', - 'GaussianMixture', 'GaussianMixtureModel', + 'GaussianMixture', 'GaussianMixtureModel', 'GaussianMixtureSummary', 'LDA', 'LDAModel', 'LocalLDAModel', 'DistributedLDAModel'] +class ClusteringSummary(JavaWrapper): + """ + .. note:: Experimental + + Clustering results for a given model. + + .. versionadded:: 2.1.0 + """ + + @property + @since("2.1.0") + def predictionCol(self): + """ + Name for column of predicted clusters in `predictions`. + """ + return self._call_java("predictionCol") + + @property + @since("2.1.0") + def predictions(self): + """ + DataFrame produced by the model's `transform` method. + """ + return self._call_java("predictions") + + @property + @since("2.1.0") + def featuresCol(self): + """ + Name for column of features in `predictions`. + """ + return self._call_java("featuresCol") + + @property + @since("2.1.0") + def k(self): + """ + The number of clusters the model was trained with. + """ + return self._call_java("k") + + @property + @since("2.1.0") + def cluster(self): + """ + DataFrame of predicted cluster centers for each training data point. + """ + return self._call_java("cluster") + + @property + @since("2.1.0") + def clusterSizes(self): + """ + Size of (number of data points in) each cluster. + """ + return self._call_java("clusterSizes") + + class GaussianMixtureModel(JavaModel, JavaMLWritable, JavaMLReadable): """ .. note:: Experimental @@ -56,6 +114,28 @@ def gaussiansDF(self): """ return self._call_java("gaussiansDF") + @property + @since("2.1.0") + def hasSummary(self): + """ + Indicates whether a training summary exists for this model + instance. + """ + return self._call_java("hasSummary") + + @property + @since("2.1.0") + def summary(self): + """ + Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the + training set. An exception is thrown if no summary exists. + """ + if self.hasSummary: + return GaussianMixtureSummary(self._call_java("summary")) + else: + raise RuntimeError("No training summary available for this %s" % + self.__class__.__name__) + @inherit_doc class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, @@ -92,6 +172,13 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte >>> gm = GaussianMixture(k=3, tol=0.0001, ... 
maxIter=10, seed=10) >>> model = gm.fit(df) + >>> model.hasSummary + True + >>> summary = model.summary + >>> summary.k + 3 + >>> summary.clusterSizes + [2, 2, 2] >>> weights = model.weights >>> len(weights) 3 @@ -118,6 +205,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte >>> model_path = temp_path + "/gmm_model" >>> model.save(model_path) >>> model2 = GaussianMixtureModel.load(model_path) + >>> model2.hasSummary + False >>> model2.weights == model.weights True >>> model2.gaussiansDF.show() @@ -181,6 +270,32 @@ def getK(self): return self.getOrDefault(self.k) +class GaussianMixtureSummary(ClusteringSummary): + """ + .. note:: Experimental + + Gaussian mixture clustering results for a given model. + + .. versionadded:: 2.1.0 + """ + + @property + @since("2.1.0") + def probabilityCol(self): + """ + Name for column of predicted probability of each cluster in `predictions`. + """ + return self._call_java("probabilityCol") + + @property + @since("2.1.0") + def probability(self): + """ + DataFrame of probabilities of each cluster for each training data point. + """ + return self._call_java("probability") + + class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): """ Model fitted by KMeans. @@ -346,6 +461,27 @@ def computeCost(self, dataset): """ return self._call_java("computeCost", dataset) + @property + @since("2.1.0") + def hasSummary(self): + """ + Indicates whether a training summary exists for this model instance. + """ + return self._call_java("hasSummary") + + @property + @since("2.1.0") + def summary(self): + """ + Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the + training set. An exception is thrown if no summary exists. + """ + if self.hasSummary: + return BisectingKMeansSummary(self._call_java("summary")) + else: + raise RuntimeError("No training summary available for this %s" % + self.__class__.__name__) + @inherit_doc class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasSeed, @@ -373,6 +509,13 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte 2 >>> model.computeCost(df) 2.000... + >>> model.hasSummary + True + >>> summary = model.summary + >>> summary.k + 2 + >>> summary.clusterSizes + [2, 2] >>> transformed = model.transform(df).select("features", "prediction") >>> rows = transformed.collect() >>> rows[0].prediction == rows[1].prediction @@ -387,6 +530,8 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte >>> model_path = temp_path + "/bkm_model" >>> model.save(model_path) >>> model2 = BisectingKMeansModel.load(model_path) + >>> model2.hasSummary + False >>> model.clusterCenters()[0] == model2.clusterCenters()[0] array([ True, True], dtype=bool) >>> model.clusterCenters()[1] == model2.clusterCenters()[1] @@ -460,6 +605,17 @@ def _create_model(self, java_model): return BisectingKMeansModel(java_model) +class BisectingKMeansSummary(ClusteringSummary): + """ + .. note:: Experimental + + Bisecting KMeans clustering results for a given model. + + .. versionadded:: 2.1.0 + """ + pass + + @inherit_doc class LDAModel(JavaModel): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 0bc319ca4d601..385391ba53fd4 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -160,8 +160,12 @@ def summary(self): training set. An exception is thrown if `trainingSummary is None`. 
""" - java_lrt_summary = self._call_java("summary") - return LinearRegressionTrainingSummary(java_lrt_summary) + if self.hasSummary: + java_lrt_summary = self._call_java("summary") + return LinearRegressionTrainingSummary(java_lrt_summary) + else: + raise RuntimeError("No training summary available for this %s" % + self.__class__.__name__) @property @since("2.0.0") @@ -1459,8 +1463,12 @@ def summary(self): training set. An exception is thrown if `trainingSummary is None`. """ - java_glrt_summary = self._call_java("summary") - return GeneralizedLinearRegressionTrainingSummary(java_glrt_summary) + if self.hasSummary: + java_glrt_summary = self._call_java("summary") + return GeneralizedLinearRegressionTrainingSummary(java_glrt_summary) + else: + raise RuntimeError("No training summary available for this %s" % + self.__class__.__name__) @property @since("2.0.0") diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 9d46cc3b4ae64..c0f0d4073564e 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1097,6 +1097,38 @@ def test_logistic_regression_summary(self): sameSummary = model.evaluate(df) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) + def test_gaussian_mixture_summary(self): + data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), + (Vectors.sparse(1, [], []),)] + df = self.spark.createDataFrame(data, ["features"]) + gmm = GaussianMixture(k=2) + model = gmm.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.probabilityCol, "probability") + self.assertTrue(isinstance(s.probability, DataFrame)) + self.assertEqual(s.featuresCol, "features") + self.assertEqual(s.predictionCol, "prediction") + self.assertTrue(isinstance(s.cluster, DataFrame)) + self.assertEqual(len(s.clusterSizes), 2) + self.assertEqual(s.k, 2) + + def test_bisecting_kmeans_summary(self): + data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),), + (Vectors.sparse(1, [], []),)] + df = self.spark.createDataFrame(data, ["features"]) + bkm = BisectingKMeans(k=2) + model = bkm.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.featuresCol, "features") + self.assertEqual(s.predictionCol, "prediction") + self.assertTrue(isinstance(s.cluster, DataFrame)) + self.assertEqual(len(s.clusterSizes), 2) + self.assertEqual(s.k, 2) + class OneVsRestTests(SparkSessionTestCase): From 251a9927646f367ca2cf75a87e80ce1c061a8f27 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Mon, 21 Nov 2016 05:50:35 -0800 Subject: [PATCH 160/534] [SPARK-18398][SQL] Fix nullabilities of MapObjects and ExternalMapToCatalyst. ## What changes were proposed in this pull request? The nullabilities of `MapObject` can be made more strict by relying on `inputObject.nullable` and `lambdaFunction.nullable`. Also `ExternalMapToCatalyst.dataType` can be made more strict by relying on `valueConverter.nullable`. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #15840 from ueshin/issues/SPARK-18398. 
(cherry picked from commit 9f262ae163b6dca6526665b3ad12b3b2ea8fb873) Signed-off-by: Herman van Hovell --- .../spark/sql/catalyst/expressions/objects/objects.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 0b36091ece1bf..5c27179ec3b46 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -461,14 +461,15 @@ case class MapObjects private( lambdaFunction: Expression, inputData: Expression) extends Expression with NonSQLExpression { - override def nullable: Boolean = true + override def nullable: Boolean = inputData.nullable override def children: Seq[Expression] = lambdaFunction :: inputData :: Nil override def eval(input: InternalRow): Any = throw new UnsupportedOperationException("Only code-generated evaluation is supported") - override def dataType: DataType = ArrayType(lambdaFunction.dataType) + override def dataType: DataType = + ArrayType(lambdaFunction.dataType, containsNull = lambdaFunction.nullable) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val elementJavaType = ctx.javaType(loopVarDataType) @@ -642,7 +643,8 @@ case class ExternalMapToCatalyst private( override def foldable: Boolean = false - override def dataType: MapType = MapType(keyConverter.dataType, valueConverter.dataType) + override def dataType: MapType = MapType( + keyConverter.dataType, valueConverter.dataType, valueContainsNull = valueConverter.nullable) override def eval(input: InternalRow): Any = throw new UnsupportedOperationException("Only code-generated evaluation is supported") From b0a73c9be3b691f95d2f6ace3d6304db7f69705f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 21 Nov 2016 16:14:59 -0500 Subject: [PATCH 161/534] [SPARK-18517][SQL] DROP TABLE IF EXISTS should not warn for non-existing tables ## What changes were proposed in this pull request? Currently, `DROP TABLE IF EXISTS` shows warning for non-existing tables. However, it had better be quiet for this case by definition of the command. **BEFORE** ```scala scala> sql("DROP TABLE IF EXISTS nonexist") 16/11/20 20:48:26 WARN DropTableCommand: org.apache.spark.sql.catalyst.analysis.NoSuchTableException: Table or view 'nonexist' not found in database 'default'; ``` **AFTER** ```scala scala> sql("DROP TABLE IF EXISTS nonexist") res0: org.apache.spark.sql.DataFrame = [] ``` ## How was this patch tested? Manual because this is related to the warning messages instead of exceptions. Author: Dongjoon Hyun Closes #15953 from dongjoon-hyun/SPARK-18517. 
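
Since the change was only verified manually, the quiet path could also be covered by a regression test along the following lines. This is a sketch only; the suite shown here is a hypothetical addition, not part of this patch:

```scala
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.test.SharedSQLContext

// Hypothetical check: DROP TABLE IF EXISTS on a missing table should complete
// without raising (the absence of the WARN line still has to be confirmed in the
// logs), while the plain DROP TABLE keeps failing as before.
class DropTableIfExistsSuite extends SparkFunSuite with SharedSQLContext {
  test("SPARK-18517: DROP TABLE IF EXISTS is quiet for a non-existing table") {
    spark.sql("DROP TABLE IF EXISTS nonexist")
    intercept[AnalysisException] {
      spark.sql("DROP TABLE nonexist")
    }
  }
}
```
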
(cherry picked from commit ddd02f50bb7458410d65427321efc75da5e65224) Signed-off-by: Andrew Or --- .../scala/org/apache/spark/sql/execution/command/ddl.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 570a9967871e9..0f126d0200eff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.Resolver +import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} @@ -202,6 +202,7 @@ case class DropTableCommand( sparkSession.sharedState.cacheManager.uncacheQuery( sparkSession.table(tableName.quotedString)) } catch { + case _: NoSuchTableException if ifExists => case NonFatal(e) => log.warn(e.toString, e) } catalog.refreshTable(tableName) From 406f33987ac078fb20d2f5e81b7e1f646ea53fed Mon Sep 17 00:00:00 2001 From: Gabriel Huang Date: Mon, 21 Nov 2016 16:08:34 -0500 Subject: [PATCH 162/534] [SPARK-18361][PYSPARK] Expose RDD localCheckpoint in PySpark ## What changes were proposed in this pull request? Expose RDD's localCheckpoint() and associated functions in PySpark. ## How was this patch tested? I added a UnitTest in python/pyspark/tests.py which passes. I certify that this is my original work, and I license it to the project under the project's open source license. Gabriel HUANG Developer at Cardabel (http://cardabel.com/) Author: Gabriel Huang Closes #15811 from gabrielhuang/pyspark-localcheckpoint. --- python/pyspark/rdd.py | 33 ++++++++++++++++++++++++++++++++- python/pyspark/tests.py | 17 +++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 641787ee20e0c..f21a364df9100 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -263,13 +263,44 @@ def checkpoint(self): def isCheckpointed(self): """ - Return whether this RDD has been checkpointed or not + Return whether this RDD is checkpointed and materialized, either reliably or locally. """ return self._jrdd.rdd().isCheckpointed() + def localCheckpoint(self): + """ + Mark this RDD for local checkpointing using Spark's existing caching layer. + + This method is for users who wish to truncate RDD lineages while skipping the expensive + step of replicating the materialized data in a reliable distributed file system. This is + useful for RDDs with long lineages that need to be truncated periodically (e.g. GraphX). + + Local checkpointing sacrifices fault-tolerance for performance. In particular, checkpointed + data is written to ephemeral local storage in the executors instead of to a reliable, + fault-tolerant storage. The effect is that if an executor fails during the computation, + the checkpointed data may no longer be accessible, causing an irrecoverable job failure. + + This is NOT safe to use with dynamic allocation, which removes executors along + with their cached blocks. 
If you must use both features, you are advised to set + L{spark.dynamicAllocation.cachedExecutorIdleTimeout} to a high value. + + The checkpoint directory set through L{SparkContext.setCheckpointDir()} is not used. + """ + self._jrdd.rdd().localCheckpoint() + + def isLocallyCheckpointed(self): + """ + Return whether this RDD is marked for local checkpointing. + + Exposed for testing. + """ + return self._jrdd.rdd().isLocallyCheckpointed() + def getCheckpointFile(self): """ Gets the name of the file to which this RDD was checkpointed + + Not defined if RDD is checkpointed locally. """ checkpointFile = self._jrdd.rdd().getCheckpointFile() if checkpointFile.isDefined(): diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 3e0bd16d85ca4..ab4bef8329cd0 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -390,6 +390,23 @@ def test_checkpoint_and_restore(self): self.assertEqual([1, 2, 3, 4], recovered.collect()) +class LocalCheckpointTests(ReusedPySparkTestCase): + + def test_basic_localcheckpointing(self): + parCollection = self.sc.parallelize([1, 2, 3, 4]) + flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1)) + + self.assertFalse(flatMappedRDD.isCheckpointed()) + self.assertFalse(flatMappedRDD.isLocallyCheckpointed()) + + flatMappedRDD.localCheckpoint() + result = flatMappedRDD.collect() + time.sleep(1) # 1 second + self.assertTrue(flatMappedRDD.isCheckpointed()) + self.assertTrue(flatMappedRDD.isLocallyCheckpointed()) + self.assertEqual(flatMappedRDD.collect(), result) + + class AddFileTests(PySparkTestCase): def test_add_py_file(self): From 2afc18be23150d283361d374caf8cbfd3da63c9c Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 21 Nov 2016 13:23:32 -0800 Subject: [PATCH 163/534] [SPARK-17765][SQL] Support for writing out user-defined type in ORC datasource ## What changes were proposed in this pull request? This PR adds the support for `UserDefinedType` when writing out instead of throwing `ClassCastException` in ORC data source. In more details, `OrcStruct` is being created based on string from`DataType.catalogString`. For user-defined type, it seems it returns `sqlType.simpleString` for `catalogString` by default[1]. However, during type-dispatching to match the output with the schema, it tries to cast to, for example, `StructType`[2]. So, running the codes below (`MyDenseVector` was borrowed[3]) : ``` scala val data = Seq((1, new UDT.MyDenseVector(Array(0.25, 2.25, 4.25)))) val udtDF = data.toDF("id", "vectors") udtDF.write.orc("/tmp/test.orc") ``` ends up throwing an exception as below: ``` java.lang.ClassCastException: org.apache.spark.sql.UDT$MyDenseVectorUDT cannot be cast to org.apache.spark.sql.types.ArrayType at org.apache.spark.sql.hive.HiveInspectors$class.wrapperFor(HiveInspectors.scala:381) at org.apache.spark.sql.hive.orc.OrcSerializer.wrapperFor(OrcFileFormat.scala:164) ... ``` So, this PR uses `UserDefinedType.sqlType` during finding the correct converter when writing out in ORC data source. [1]https://github.com/apache/spark/blob/dfdcab00c7b6200c22883baa3ebc5818be09556f/sql/catalyst/src/main/scala/org/apache/spark/sql/types/UserDefinedType.scala#L95 [2]https://github.com/apache/spark/blob/d2dc8c4a162834818190ffd82894522c524ca3e5/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala#L326 [3]https://github.com/apache/spark/blob/2bfed1a0c5be7d0718fd574a4dad90f4f6b44be7/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala#L38-L70 ## How was this patch tested? Unit tests in `OrcQuerySuite`. 
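
For reference, the round trip exercised by the new test looks roughly like this in a spark-shell session; `MyDenseVector` is the example UDT borrowed from Spark's test code (as in the snippet above), and the output path is arbitrary:

```scala
import spark.implicits._

// Write a DataFrame with a UDT column to ORC and read it back with an explicit
// schema; before this patch the write itself failed with the ClassCastException
// shown above.
val data = Seq((1, new UDT.MyDenseVector(Array(0.25, 2.25, 4.25))))
val udtDF = data.toDF("id", "vectors")
udtDF.write.orc("/tmp/test.orc")

val readBack = spark.read.schema(udtDF.schema).orc("/tmp/test.orc")
readBack.show()   // expected to return the original id/vectors rows
```
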
Author: hyukjinkwon Closes #15361 from HyukjinKwon/SPARK-17765. (cherry picked from commit a2d464770cd183daa7d727bf377bde9c21e29e6a) Signed-off-by: Reynold Xin --- .../org/apache/spark/sql/hive/HiveInspectors.scala | 3 +++ .../org/apache/spark/sql/hive/orc/OrcQuerySuite.scala | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index e303065127c3b..52aa1088acd4a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -246,6 +246,9 @@ private[hive] trait HiveInspectors { * Wraps with Hive types based on object inspector. */ protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => Any = oi match { + case _ if dataType.isInstanceOf[UserDefinedType[_]] => + val sqlType = dataType.asInstanceOf[UserDefinedType[_]].sqlType + wrapperFor(oi, sqlType) case x: ConstantObjectInspector => (o: Any) => x.getWritableConstantValue diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index a628977af2f4e..b8761e9de2886 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -93,6 +93,16 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } + test("Read/write UserDefinedType") { + withTempPath { path => + val data = Seq((1, new UDT.MyDenseVector(Array(0.25, 2.25, 4.25)))) + val udtDF = data.toDF("id", "vectors") + udtDF.write.orc(path.getAbsolutePath) + val readBack = spark.read.schema(udtDF.schema).orc(path.getAbsolutePath) + checkAnswer(udtDF, readBack) + } + } + test("Creating case class RDD table") { val data = (1 to 100).map(i => (i, s"val_$i")) sparkContext.parallelize(data).toDF().createOrReplaceTempView("t") From 6dbe44891458b497c1ad4df8d8358e326fb3f795 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Mon, 21 Nov 2016 17:24:02 -0800 Subject: [PATCH 164/534] [SPARK-18493] Add missing python APIs: withWatermark and checkpoint to dataframe ## What changes were proposed in this pull request? This PR adds two of the newly added methods of `Dataset`s to Python: `withWatermark` and `checkpoint` ## How was this patch tested? Doc tests Author: Burak Yavuz Closes #15921 from brkyvz/py-watermark. (cherry picked from commit 97a8239a625df455d2c439f3628a529d6d9413ca) Signed-off-by: Shixiong Zhu --- python/pyspark/sql/dataframe.py | 57 ++++++++++++++++++- .../scala/org/apache/spark/sql/Dataset.scala | 10 +++- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 38998900837cf..6fe622643291e 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -322,6 +322,54 @@ def show(self, n=20, truncate=True): def __repr__(self): return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes)) + @since(2.1) + def checkpoint(self, eager=True): + """Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the + logical plan of this DataFrame, which is especially useful in iterative algorithms where the + plan may grow exponentially. It will be saved to files inside the checkpoint + directory set with L{SparkContext.setCheckpointDir()}. 
+ + :param eager: Whether to checkpoint this DataFrame immediately + + .. note:: Experimental + """ + jdf = self._jdf.checkpoint(eager) + return DataFrame(jdf, self.sql_ctx) + + @since(2.1) + def withWatermark(self, eventTime, delayThreshold): + """Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point + in time before which we assume no more late data is going to arrive. + + Spark will use this watermark for several purposes: + - To know when a given time window aggregation can be finalized and thus can be emitted + when using output modes that do not allow updates. + + - To minimize the amount of state that we need to keep for on-going aggregations. + + The current watermark is computed by looking at the `MAX(eventTime)` seen across + all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost + of coordinating this value across partitions, the actual watermark used is only guaranteed + to be at least `delayThreshold` behind the actual event time. In some cases we may still + process records that arrive more than `delayThreshold` late. + + :param eventTime: the name of the column that contains the event time of the row. + :param delayThreshold: the minimum delay to wait to data to arrive late, relative to the + latest record that has been processed in the form of an interval + (e.g. "1 minute" or "5 hours"). + + .. note:: Experimental + + >>> sdf.select('name', sdf.time.cast('timestamp')).withWatermark('time', '10 minutes') + DataFrame[name: string, time: timestamp] + """ + if not eventTime or type(eventTime) is not str: + raise TypeError("eventTime should be provided as a string") + if not delayThreshold or type(delayThreshold) is not str: + raise TypeError("delayThreshold should be provided as a string interval") + jdf = self._jdf.withWatermark(eventTime, delayThreshold) + return DataFrame(jdf, self.sql_ctx) + @since(1.3) def count(self): """Returns the number of rows in this :class:`DataFrame`. @@ -1626,6 +1674,7 @@ def _test(): from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext, SparkSession import pyspark.sql.dataframe + from pyspark.sql.functions import from_unixtime globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc @@ -1638,9 +1687,11 @@ def _test(): globs['df3'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80), - Row(name='Bob', age=5, height=None), - Row(name='Tom', age=None, height=None), - Row(name=None, age=None, height=None)]).toDF() + Row(name='Bob', age=5, height=None), + Row(name='Tom', age=None, height=None), + Row(name=None, age=None, height=None)]).toDF() + globs['sdf'] = sc.parallelize([Row(name='Tom', time=1479441846), + Row(name='Bob', time=1479442946)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 3c75a6a45ec86..7ba6ffce278cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -485,7 +485,10 @@ class Dataset[T] private[sql]( def isStreaming: Boolean = logicalPlan.isStreaming /** - * Returns a checkpointed version of this Dataset. + * Eagerly checkpoint a Dataset and return the new Dataset. 
Checkpointing can be used to truncate + * the logical plan of this Dataset, which is especially useful in iterative algorithms where the + * plan may grow exponentially. It will be saved to files inside the checkpoint + * directory set with `SparkContext#setCheckpointDir`. * * @group basic * @since 2.1.0 @@ -495,7 +498,10 @@ class Dataset[T] private[sql]( def checkpoint(): Dataset[T] = checkpoint(eager = true) /** - * Returns a checkpointed version of this Dataset. + * Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the + * logical plan of this Dataset, which is especially useful in iterative algorithms where the + * plan may grow exponentially. It will be saved to files inside the checkpoint + * directory set with `SparkContext#setCheckpointDir`. * * @group basic * @since 2.1.0 From aaa2a173a81868a92d61bcc9420961aaa7eaeb57 Mon Sep 17 00:00:00 2001 From: Liwei Lin Date: Mon, 21 Nov 2016 21:14:13 -0800 Subject: [PATCH 165/534] [SPARK-18425][STRUCTURED STREAMING][TESTS] Test `CompactibleFileStreamLog` directly ## What changes were proposed in this pull request? Right now we are testing the most of `CompactibleFileStreamLog` in `FileStreamSinkLogSuite` (because `FileStreamSinkLog` once was the only subclass of `CompactibleFileStreamLog`, but now it's not the case any more). Let's refactor the tests so that `CompactibleFileStreamLog` is directly tested, making future changes (like https://github.com/apache/spark/pull/15828, https://github.com/apache/spark/pull/15827) to `CompactibleFileStreamLog` much easier to test and much easier to review. ## How was this patch tested? the PR itself is about tests Author: Liwei Lin Closes #15870 from lw-lin/test-compact-1113. (cherry picked from commit ebeb0830a3a4837c7354a0eee667b9f5fad389c5) Signed-off-by: Shixiong Zhu --- .../CompactibleFileStreamLogSuite.scala | 216 +++++++++++++++++- .../streaming/FileStreamSinkLogSuite.scala | 68 ------ 2 files changed, 214 insertions(+), 70 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala index 2cd2157b293cb..e511fda57912c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala @@ -17,12 +17,79 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.SparkFunSuite +import java.io._ +import java.nio.charset.StandardCharsets._ -class CompactibleFileStreamLogSuite extends SparkFunSuite { +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.sql.execution.streaming.FakeFileSystem._ +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.test.SharedSQLContext + +class CompactibleFileStreamLogSuite extends SparkFunSuite with SharedSQLContext { + + /** To avoid caching of FS objects */ + override protected val sparkConf = + new SparkConf().set(s"spark.hadoop.fs.$scheme.impl.disable.cache", "true") import CompactibleFileStreamLog._ + /** -- testing of `object CompactibleFileStreamLog` begins -- */ + + test("getBatchIdFromFileName") { + assert(1234L === getBatchIdFromFileName("1234")) + assert(1234L === getBatchIdFromFileName("1234.compact")) + intercept[NumberFormatException] { + getBatchIdFromFileName("1234a") + } + } + + test("isCompactionBatch") { + assert(false === isCompactionBatch(0, 
compactInterval = 3)) + assert(false === isCompactionBatch(1, compactInterval = 3)) + assert(true === isCompactionBatch(2, compactInterval = 3)) + assert(false === isCompactionBatch(3, compactInterval = 3)) + assert(false === isCompactionBatch(4, compactInterval = 3)) + assert(true === isCompactionBatch(5, compactInterval = 3)) + } + + test("nextCompactionBatchId") { + assert(2 === nextCompactionBatchId(0, compactInterval = 3)) + assert(2 === nextCompactionBatchId(1, compactInterval = 3)) + assert(5 === nextCompactionBatchId(2, compactInterval = 3)) + assert(5 === nextCompactionBatchId(3, compactInterval = 3)) + assert(5 === nextCompactionBatchId(4, compactInterval = 3)) + assert(8 === nextCompactionBatchId(5, compactInterval = 3)) + } + + test("getValidBatchesBeforeCompactionBatch") { + intercept[AssertionError] { + getValidBatchesBeforeCompactionBatch(0, compactInterval = 3) + } + intercept[AssertionError] { + getValidBatchesBeforeCompactionBatch(1, compactInterval = 3) + } + assert(Seq(0, 1) === getValidBatchesBeforeCompactionBatch(2, compactInterval = 3)) + intercept[AssertionError] { + getValidBatchesBeforeCompactionBatch(3, compactInterval = 3) + } + intercept[AssertionError] { + getValidBatchesBeforeCompactionBatch(4, compactInterval = 3) + } + assert(Seq(2, 3, 4) === getValidBatchesBeforeCompactionBatch(5, compactInterval = 3)) + } + + test("getAllValidBatches") { + assert(Seq(0) === getAllValidBatches(0, compactInterval = 3)) + assert(Seq(0, 1) === getAllValidBatches(1, compactInterval = 3)) + assert(Seq(2) === getAllValidBatches(2, compactInterval = 3)) + assert(Seq(2, 3) === getAllValidBatches(3, compactInterval = 3)) + assert(Seq(2, 3, 4) === getAllValidBatches(4, compactInterval = 3)) + assert(Seq(5) === getAllValidBatches(5, compactInterval = 3)) + assert(Seq(5, 6) === getAllValidBatches(6, compactInterval = 3)) + assert(Seq(5, 6, 7) === getAllValidBatches(7, compactInterval = 3)) + assert(Seq(8) === getAllValidBatches(8, compactInterval = 3)) + } + test("deriveCompactInterval") { // latestCompactBatchId(4) + 1 <= default(5) // then use latestestCompactBatchId + 1 === 5 @@ -30,4 +97,149 @@ class CompactibleFileStreamLogSuite extends SparkFunSuite { // First divisor of 10 greater than 4 === 5 assert(5 === deriveCompactInterval(4, 9)) } + + /** -- testing of `object CompactibleFileStreamLog` ends -- */ + + test("batchIdToPath") { + withFakeCompactibleFileStreamLog( + fileCleanupDelayMs = Long.MaxValue, + defaultCompactInterval = 3, + compactibleLog => { + assert("0" === compactibleLog.batchIdToPath(0).getName) + assert("1" === compactibleLog.batchIdToPath(1).getName) + assert("2.compact" === compactibleLog.batchIdToPath(2).getName) + assert("3" === compactibleLog.batchIdToPath(3).getName) + assert("4" === compactibleLog.batchIdToPath(4).getName) + assert("5.compact" === compactibleLog.batchIdToPath(5).getName) + }) + } + + test("serialize") { + withFakeCompactibleFileStreamLog( + fileCleanupDelayMs = Long.MaxValue, + defaultCompactInterval = 3, + compactibleLog => { + val logs = Array("entry_1", "entry_2", "entry_3") + val expected = s"""${FakeCompactibleFileStreamLog.VERSION} + |"entry_1" + |"entry_2" + |"entry_3"""".stripMargin + val baos = new ByteArrayOutputStream() + compactibleLog.serialize(logs, baos) + assert(expected === baos.toString(UTF_8.name())) + + baos.reset() + compactibleLog.serialize(Array(), baos) + assert(FakeCompactibleFileStreamLog.VERSION === baos.toString(UTF_8.name())) + }) + } + + test("deserialize") { + withFakeCompactibleFileStreamLog( + 
fileCleanupDelayMs = Long.MaxValue, + defaultCompactInterval = 3, + compactibleLog => { + val logs = s"""${FakeCompactibleFileStreamLog.VERSION} + |"entry_1" + |"entry_2" + |"entry_3"""".stripMargin + val expected = Array("entry_1", "entry_2", "entry_3") + assert(expected === + compactibleLog.deserialize(new ByteArrayInputStream(logs.getBytes(UTF_8)))) + + assert(Nil === + compactibleLog.deserialize( + new ByteArrayInputStream(FakeCompactibleFileStreamLog.VERSION.getBytes(UTF_8)))) + }) + } + + testWithUninterruptibleThread("compact") { + withFakeCompactibleFileStreamLog( + fileCleanupDelayMs = Long.MaxValue, + defaultCompactInterval = 3, + compactibleLog => { + for (batchId <- 0 to 10) { + compactibleLog.add(batchId, Array("some_path_" + batchId)) + val expectedFiles = (0 to batchId).map { id => "some_path_" + id } + assert(compactibleLog.allFiles() === expectedFiles) + if (isCompactionBatch(batchId, 3)) { + // Since batchId is a compaction batch, the batch log file should contain all logs + assert(compactibleLog.get(batchId).getOrElse(Nil) === expectedFiles) + } + } + }) + } + + testWithUninterruptibleThread("delete expired file") { + // Set `fileCleanupDelayMs` to 0 so that we can detect the deleting behaviour deterministically + withFakeCompactibleFileStreamLog( + fileCleanupDelayMs = 0, + defaultCompactInterval = 3, + compactibleLog => { + val fs = compactibleLog.metadataPath.getFileSystem(spark.sessionState.newHadoopConf()) + + def listBatchFiles(): Set[String] = { + fs.listStatus(compactibleLog.metadataPath).map(_.getPath.getName).filter { fileName => + try { + getBatchIdFromFileName(fileName) + true + } catch { + case _: NumberFormatException => false + } + }.toSet + } + + compactibleLog.add(0, Array("some_path_0")) + assert(Set("0") === listBatchFiles()) + compactibleLog.add(1, Array("some_path_1")) + assert(Set("0", "1") === listBatchFiles()) + compactibleLog.add(2, Array("some_path_2")) + assert(Set("2.compact") === listBatchFiles()) + compactibleLog.add(3, Array("some_path_3")) + assert(Set("2.compact", "3") === listBatchFiles()) + compactibleLog.add(4, Array("some_path_4")) + assert(Set("2.compact", "3", "4") === listBatchFiles()) + compactibleLog.add(5, Array("some_path_5")) + assert(Set("5.compact") === listBatchFiles()) + }) + } + + private def withFakeCompactibleFileStreamLog( + fileCleanupDelayMs: Long, + defaultCompactInterval: Int, + f: FakeCompactibleFileStreamLog => Unit + ): Unit = { + withTempDir { file => + val compactibleLog = new FakeCompactibleFileStreamLog( + fileCleanupDelayMs, + defaultCompactInterval, + spark, + file.getCanonicalPath) + f(compactibleLog) + } + } +} + +object FakeCompactibleFileStreamLog { + val VERSION = "test_version" +} + +class FakeCompactibleFileStreamLog( + _fileCleanupDelayMs: Long, + _defaultCompactInterval: Int, + sparkSession: SparkSession, + path: String) + extends CompactibleFileStreamLog[String]( + FakeCompactibleFileStreamLog.VERSION, + sparkSession, + path + ) { + + override protected def fileCleanupDelayMs: Long = _fileCleanupDelayMs + + override protected def isDeletingExpiredLog: Boolean = true + + override protected def defaultCompactInterval: Int = _defaultCompactInterval + + override def compactLogs(logs: Seq[String]): Seq[String] = logs } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala index e1bc674a28071..e046fee0c04d3 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala @@ -29,61 +29,6 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSQLContext { import CompactibleFileStreamLog._ import FileStreamSinkLog._ - test("getBatchIdFromFileName") { - assert(1234L === getBatchIdFromFileName("1234")) - assert(1234L === getBatchIdFromFileName("1234.compact")) - intercept[NumberFormatException] { - getBatchIdFromFileName("1234a") - } - } - - test("isCompactionBatch") { - assert(false === isCompactionBatch(0, compactInterval = 3)) - assert(false === isCompactionBatch(1, compactInterval = 3)) - assert(true === isCompactionBatch(2, compactInterval = 3)) - assert(false === isCompactionBatch(3, compactInterval = 3)) - assert(false === isCompactionBatch(4, compactInterval = 3)) - assert(true === isCompactionBatch(5, compactInterval = 3)) - } - - test("nextCompactionBatchId") { - assert(2 === nextCompactionBatchId(0, compactInterval = 3)) - assert(2 === nextCompactionBatchId(1, compactInterval = 3)) - assert(5 === nextCompactionBatchId(2, compactInterval = 3)) - assert(5 === nextCompactionBatchId(3, compactInterval = 3)) - assert(5 === nextCompactionBatchId(4, compactInterval = 3)) - assert(8 === nextCompactionBatchId(5, compactInterval = 3)) - } - - test("getValidBatchesBeforeCompactionBatch") { - intercept[AssertionError] { - getValidBatchesBeforeCompactionBatch(0, compactInterval = 3) - } - intercept[AssertionError] { - getValidBatchesBeforeCompactionBatch(1, compactInterval = 3) - } - assert(Seq(0, 1) === getValidBatchesBeforeCompactionBatch(2, compactInterval = 3)) - intercept[AssertionError] { - getValidBatchesBeforeCompactionBatch(3, compactInterval = 3) - } - intercept[AssertionError] { - getValidBatchesBeforeCompactionBatch(4, compactInterval = 3) - } - assert(Seq(2, 3, 4) === getValidBatchesBeforeCompactionBatch(5, compactInterval = 3)) - } - - test("getAllValidBatches") { - assert(Seq(0) === getAllValidBatches(0, compactInterval = 3)) - assert(Seq(0, 1) === getAllValidBatches(1, compactInterval = 3)) - assert(Seq(2) === getAllValidBatches(2, compactInterval = 3)) - assert(Seq(2, 3) === getAllValidBatches(3, compactInterval = 3)) - assert(Seq(2, 3, 4) === getAllValidBatches(4, compactInterval = 3)) - assert(Seq(5) === getAllValidBatches(5, compactInterval = 3)) - assert(Seq(5, 6) === getAllValidBatches(6, compactInterval = 3)) - assert(Seq(5, 6, 7) === getAllValidBatches(7, compactInterval = 3)) - assert(Seq(8) === getAllValidBatches(8, compactInterval = 3)) - } - test("compactLogs") { withFileStreamSinkLog { sinkLog => val logs = Seq( @@ -184,19 +129,6 @@ class FileStreamSinkLogSuite extends SparkFunSuite with SharedSQLContext { } } - test("batchIdToPath") { - withSQLConf(SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key -> "3") { - withFileStreamSinkLog { sinkLog => - assert("0" === sinkLog.batchIdToPath(0).getName) - assert("1" === sinkLog.batchIdToPath(1).getName) - assert("2.compact" === sinkLog.batchIdToPath(2).getName) - assert("3" === sinkLog.batchIdToPath(3).getName) - assert("4" === sinkLog.batchIdToPath(4).getName) - assert("5.compact" === sinkLog.batchIdToPath(5).getName) - } - } - } - testWithUninterruptibleThread("compact") { withSQLConf(SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key -> "3") { withFileStreamSinkLog { sinkLog => From c7021407597480bddf226ffa6d1d3f682408dfeb Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 22 Nov 2016 00:05:30 -0800 
Subject: [PATCH 166/534] [SPARK-18444][SPARKR] SparkR running in yarn-cluster mode should not download Spark package. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? When running SparkR job in yarn-cluster mode, it will download Spark package from apache website which is not necessary. ``` ./bin/spark-submit --master yarn-cluster ./examples/src/main/r/dataframe.R ``` The following is output: ``` Attaching package: ‘SparkR’ The following objects are masked from ‘package:stats’: cov, filter, lag, na.omit, predict, sd, var, window The following objects are masked from ‘package:base’: as.data.frame, colnames, colnames<-, drop, endsWith, intersect, rank, rbind, sample, startsWith, subset, summary, transform, union Spark not found in SPARK_HOME: Spark not found in the cache directory. Installation will start. MirrorUrl not provided. Looking for preferred site from apache website... ...... ``` There's no ```SPARK_HOME``` in yarn-cluster mode since the R process is in a remote host of the yarn cluster rather than in the client host. The JVM comes up first and the R process then connects to it. So in such cases we should never have to download Spark as Spark is already running. ## How was this patch tested? Offline test. Author: Yanbo Liang Closes #15888 from yanboliang/spark-18444. (cherry picked from commit acb97157796231fef74aba985825b05b607b9279) Signed-off-by: Yanbo Liang --- R/pkg/R/sparkR.R | 20 +++++++---- R/pkg/R/utils.R | 4 +++ R/pkg/inst/tests/testthat/test_sparkR.R | 46 +++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 R/pkg/inst/tests/testthat/test_sparkR.R diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 6b4a2f2fdc85c..a7152b4313993 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -373,8 +373,13 @@ sparkR.session <- function( overrideEnvs(sparkConfigMap, paramMap) } + deployMode <- "" + if (exists("spark.submit.deployMode", envir = sparkConfigMap)) { + deployMode <- sparkConfigMap[["spark.submit.deployMode"]] + } + if (!exists(".sparkRjsc", envir = .sparkREnv)) { - retHome <- sparkCheckInstall(sparkHome, master) + retHome <- sparkCheckInstall(sparkHome, master, deployMode) if (!is.null(retHome)) sparkHome <- retHome sparkExecutorEnvMap <- new.env() sparkR.sparkContext(master, appName, sparkHome, sparkConfigMap, sparkExecutorEnvMap, @@ -550,24 +555,27 @@ processSparkPackages <- function(packages) { # # @param sparkHome directory to find Spark package. # @param master the Spark master URL, used to check local or remote mode. +# @param deployMode whether to deploy your driver on the worker nodes (cluster) +# or locally as an external client (client). # @return NULL if no need to update sparkHome, and new sparkHome otherwise. 
-sparkCheckInstall <- function(sparkHome, master) { +sparkCheckInstall <- function(sparkHome, master, deployMode) { if (!isSparkRShell()) { if (!is.na(file.info(sparkHome)$isdir)) { msg <- paste0("Spark package found in SPARK_HOME: ", sparkHome) message(msg) NULL } else { - if (!nzchar(master) || isMasterLocal(master)) { - msg <- paste0("Spark not found in SPARK_HOME: ", - sparkHome) + if (isMasterLocal(master)) { + msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome) message(msg) packageLocalDir <- install.spark() packageLocalDir - } else { + } else if (isClientMode(master) || deployMode == "client") { msg <- paste0("Spark not found in SPARK_HOME: ", sparkHome, "\n", installInstruction("remote")) stop(msg) + } else { + NULL } } } else { diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 20004549cc037..098c0e3e31e95 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -777,6 +777,10 @@ isMasterLocal <- function(master) { grepl("^local(\\[([0-9]+|\\*)\\])?$", master, perl = TRUE) } +isClientMode <- function(master) { + grepl("([a-z]+)-client$", master, perl = TRUE) +} + isSparkRShell <- function() { grepl(".*shell\\.R$", Sys.getenv("R_PROFILE_USER"), perl = TRUE) } diff --git a/R/pkg/inst/tests/testthat/test_sparkR.R b/R/pkg/inst/tests/testthat/test_sparkR.R new file mode 100644 index 0000000000000..f73fc6baeccef --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_sparkR.R @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +context("functions in sparkR.R") + +test_that("sparkCheckInstall", { + # "local, yarn-client, mesos-client" mode, SPARK_HOME was set correctly, + # and the SparkR job was submitted by "spark-submit" + sparkHome <- paste0(tempdir(), "/", "sparkHome") + dir.create(sparkHome) + master <- "" + deployMode <- "" + expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode))) + unlink(sparkHome, recursive = TRUE) + + # "yarn-cluster, mesos-cluster" mode, SPARK_HOME was not set, + # and the SparkR job was submitted by "spark-submit" + sparkHome <- "" + master <- "" + deployMode <- "" + expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode))) + + # "yarn-client, mesos-client" mode, SPARK_HOME was not set + sparkHome <- "" + master <- "yarn-client" + deployMode <- "" + expect_error(sparkCheckInstall(sparkHome, master, deployMode)) + sparkHome <- "" + master <- "" + deployMode <- "client" + expect_error(sparkCheckInstall(sparkHome, master, deployMode)) +}) From 63aa01ffe06e49af032b57ba2eb28dfb8f14f779 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 22 Nov 2016 11:26:10 +0000 Subject: [PATCH 167/534] [SPARK-18514][DOCS] Fix the markdown for `Note:`/`NOTE:`/`Note that` across R API documentation ## What changes were proposed in this pull request? 
It seems in R, there are - `Note:` - `NOTE:` - `Note that` This PR proposes to fix those to `Note:` to be consistent. **Before** ![2016-11-21 11 30 07](https://cloud.githubusercontent.com/assets/6477701/20468848/2f27b0fa-afde-11e6-89e3-993701269dbe.png) **After** ![2016-11-21 11 29 44](https://cloud.githubusercontent.com/assets/6477701/20468851/39469664-afde-11e6-9929-ad80be7fc405.png) ## How was this patch tested? The notes were found via ```bash grep -r "NOTE: " . grep -r "Note that " . ``` And then fixed one by one comparing with API documentation. After that, manually tested via `sh create-docs.sh` under `./R`. Author: hyukjinkwon Closes #15952 from HyukjinKwon/SPARK-18514. (cherry picked from commit 4922f9cdcac8b7c10320ac1fb701997fffa45d46) Signed-off-by: Sean Owen --- R/pkg/R/DataFrame.R | 6 ++++-- R/pkg/R/functions.R | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 4e3d97bb3ad07..9a51d530f120a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2541,7 +2541,8 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame #' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL. -#' Note that this does not remove duplicate rows across the two SparkDataFrames. +#' +#' Note: This does not remove duplicate rows across the two SparkDataFrames. #' #' @param x A SparkDataFrame #' @param y A SparkDataFrame @@ -2584,7 +2585,8 @@ setMethod("unionAll", #' Union two or more SparkDataFrames #' #' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL. -#' Note that this does not remove duplicate rows across the two SparkDataFrames. +#' +#' Note: This does not remove duplicate rows across the two SparkDataFrames. #' #' @param x a SparkDataFrame. #' @param ... additional SparkDataFrame(s). diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f8a9d3ce5d918..bf5c96373c632 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2296,7 +2296,7 @@ setMethod("n", signature(x = "Column"), #' A pattern could be for instance \preformatted{dd.MM.yyyy} and could return a string like '18.03.1993'. All #' pattern letters of \code{java.text.SimpleDateFormat} can be used. #' -#' NOTE: Use when ever possible specialized functions like \code{year}. These benefit from a +#' Note: Use when ever possible specialized functions like \code{year}. These benefit from a #' specialized implementation. #' #' @param y Column to compute on. @@ -2341,7 +2341,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' Locate the position of the first occurrence of substr column in the given string. #' Returns null if either of the arguments are null. #' -#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr +#' Note: The position is not zero based, but 1 based index. Returns 0 if substr #' could not be found in str. #' #' @param y column to check @@ -2779,7 +2779,8 @@ setMethod("window", signature(x = "Column"), #' locate #' #' Locate the position of the first occurrence of substr. -#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr +#' +#' Note: The position is not zero based, but 1 based index. Returns 0 if substr #' could not be found in str. #' #' @param substr a character string to be matched. 
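For reference, the behaviors that the corrected notes above describe can be observed directly: substring positions from `instr`/`locate` are 1-based and return 0 when the substring is absent, and `unionAll` does not drop duplicate rows. The snippet below is an illustrative sketch only, not part of any patch in this series; it assumes a local Spark 2.1-style PySpark environment, and the app name and variable names are hypothetical.

```python
# Illustrative sketch (not from the patches above): demonstrates the two
# documented behaviors -- 1-based substring positions that return 0 when
# the substring is missing, and unionAll() keeping duplicate rows.
from pyspark.sql import SparkSession
from pyspark.sql.functions import instr, locate

spark = (SparkSession.builder
         .master("local[1]")
         .appName("note-behaviors")   # hypothetical app name
         .getOrCreate())

df = spark.createDataFrame([("abcd",)], ["s"])

# instr/locate are 1-based: 'b' is at position 2; a missing substring gives 0.
df.select(instr(df.s, "b").alias("pos_b"),
          locate("z", df.s).alias("pos_missing")).show()
# -> pos_b = 2, pos_missing = 0

# unionAll does not remove duplicate rows across the two DataFrames.
print(df.unionAll(df).count())  # -> 2

spark.stop()
```

The same semantics apply to the SparkR `instr`/`locate` and `unionAll` documented in the preceding patch; only the note formatting, not the behavior, changed there.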
From 36cd10d19d95418cec4b789545afc798088be315 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 22 Nov 2016 11:40:18 +0000 Subject: [PATCH 168/534] [SPARK-18447][DOCS] Fix the markdown for `Note:`/`NOTE:`/`Note that` across Python API documentation ## What changes were proposed in this pull request? It seems in Python, there are - `Note:` - `NOTE:` - `Note that` - `.. note::` This PR proposes to fix those to `.. note::` to be consistent. **Before** 2016-11-21 1 18 49 2016-11-21 12 42 43 **After** 2016-11-21 1 18 42 2016-11-21 12 42 51 ## How was this patch tested? The notes were found via ```bash grep -r "Note: " . grep -r "NOTE: " . grep -r "Note that " . ``` And then fixed one by one comparing with API documentation. After that, manually tested via `make html` under `./python/docs`. Author: hyukjinkwon Closes #15947 from HyukjinKwon/SPARK-18447. (cherry picked from commit 933a6548d423cf17448207a99299cf36fc1a95f6) Signed-off-by: Sean Owen --- python/pyspark/conf.py | 4 +- python/pyspark/context.py | 8 ++-- python/pyspark/ml/classification.py | 45 +++++++++--------- python/pyspark/ml/clustering.py | 8 ++-- python/pyspark/ml/feature.py | 13 +++--- python/pyspark/ml/linalg/__init__.py | 11 +++-- python/pyspark/ml/regression.py | 32 ++++++------- python/pyspark/mllib/clustering.py | 6 +-- python/pyspark/mllib/feature.py | 24 +++++----- python/pyspark/mllib/linalg/__init__.py | 11 +++-- python/pyspark/mllib/linalg/distributed.py | 15 +++--- python/pyspark/mllib/regression.py | 2 +- python/pyspark/mllib/stat/_statistics.py | 3 +- python/pyspark/mllib/tree.py | 12 ++--- python/pyspark/rdd.py | 54 +++++++++++----------- python/pyspark/sql/dataframe.py | 28 ++++++----- python/pyspark/sql/functions.py | 11 +++-- python/pyspark/sql/streaming.py | 10 ++-- python/pyspark/streaming/context.py | 2 +- python/pyspark/streaming/kinesis.py | 4 +- 20 files changed, 157 insertions(+), 146 deletions(-) diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index 64b6f238e9c32..491b3a81972bc 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -90,8 +90,8 @@ class SparkConf(object): All setter methods in this class support chaining. For example, you can write C{conf.setMaster("local").setAppName("My app")}. - Note that once a SparkConf object is passed to Spark, it is cloned - and can no longer be modified by the user. + .. note:: Once a SparkConf object is passed to Spark, it is cloned + and can no longer be modified by the user. """ def __init__(self, loadDefaults=True, _jvm=None, _jconf=None): diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 2c2cf6a373bb7..2fd3aee01d76c 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -520,8 +520,8 @@ def wholeTextFiles(self, path, minPartitions=None, use_unicode=True): ... (a-hdfs-path/part-nnnnn, its content) - NOTE: Small files are preferred, as each file will be loaded - fully in memory. + .. note:: Small files are preferred, as each file will be loaded + fully in memory. >>> dirPath = os.path.join(tempdir, "files") >>> os.mkdir(dirPath) @@ -547,8 +547,8 @@ def binaryFiles(self, path, minPartitions=None): in a key-value pair, where the key is the path of each file, the value is the content of each file. - Note: Small files are preferred, large file is also allowable, but - may cause bad performance. + .. note:: Small files are preferred, large file is also allowable, but + may cause bad performance. 
""" minPartitions = minPartitions or self.defaultMinPartitions return RDD(self._jsc.binaryFiles(path, minPartitions), self, diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 83e1e89347660..8054a34db30f2 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -440,9 +440,9 @@ def roc(self): .. seealso:: `Wikipedia reference \ `_ - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("roc") @@ -453,9 +453,9 @@ def areaUnderROC(self): Computes the area under the receiver operating characteristic (ROC) curve. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("areaUnderROC") @@ -467,9 +467,9 @@ def pr(self): containing two fields recall, precision with (0.0, 1.0) prepended to it. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("pr") @@ -480,9 +480,9 @@ def fMeasureByThreshold(self): Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("fMeasureByThreshold") @@ -494,9 +494,9 @@ def precisionByThreshold(self): Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the precision. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("precisionByThreshold") @@ -508,9 +508,9 @@ def recallByThreshold(self): Every possible probability obtained in transforming the dataset are used as thresholds used in calculating the recall. - Note: This ignores instance weights (setting all to 1.0) from - `LogisticRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LogisticRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("recallByThreshold") @@ -695,9 +695,9 @@ def featureImportances(self): where gain is scaled by the number of instances passing through node - Normalize importances for tree to sum to 1. - Note: Feature importance for single decision trees can have high variance due to - correlated predictor variables. Consider using a :py:class:`RandomForestClassifier` - to determine feature importance instead. + .. 
note:: Feature importance for single decision trees can have high variance due to + correlated predictor variables. Consider using a :py:class:`RandomForestClassifier` + to determine feature importance instead. """ return self._call_java("featureImportances") @@ -839,7 +839,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. It supports binary labels, as well as both continuous and categorical features. - Note: Multiclass labels are not currently supported. The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999. @@ -851,6 +850,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol - We expect to implement TreeBoost in the future: `SPARK-4240 `_ + .. note:: Multiclass labels are not currently supported. + >>> from numpy import allclose >>> from pyspark.ml.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index e58ec1e7ac296..b29b5ac70e6fe 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -155,7 +155,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte While this process is generally guaranteed to converge, it is not guaranteed to find a global optimum. - Note: For high-dimensional data (with many features), this algorithm may perform poorly. + .. note:: For high-dimensional data (with many features), this algorithm may perform poorly. This is due to high-dimensional data (a) making it difficult to cluster at all (based on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions. @@ -749,9 +749,9 @@ def getCheckpointFiles(self): If using checkpointing and :py:attr:`LDA.keepLastCheckpoint` is set to true, then there may be saved checkpoint files. This method is provided so that users can manage those files. - Note that removing the checkpoints can cause failures if a partition is lost and is needed - by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up the - checkpoints when this model and derivative data go out of scope. + .. note:: Removing the checkpoints can cause failures if a partition is lost and is needed + by certain :py:class:`DistributedLDAModel` methods. Reference counting will clean up + the checkpoints when this model and derivative data go out of scope. :return List of checkpoint files from training """ diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 635cf1304588e..40b63d4d31d4b 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -742,8 +742,8 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min) - Note that since zero values will probably be transformed to non-zero values, output of the - transformer will be DenseVector even for sparse input. + .. note:: Since zero values will probably be transformed to non-zero values, output of the + transformer will be DenseVector even for sparse input. >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) @@ -1014,9 +1014,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, :py:attr:`dropLast`) because it makes the vector entries sum up to one, and hence linearly dependent. 
So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. - Note that this is different from scikit-learn's OneHotEncoder, - which keeps all categories. - The output vectors are sparse. + + .. note:: This is different from scikit-learn's OneHotEncoder, + which keeps all categories. The output vectors are sparse. .. seealso:: @@ -1698,7 +1698,8 @@ def getLabels(self): class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ A feature transformer that filters out stop words from input. - Note: null values from input array are preserved unless adding null to stopWords explicitly. + + .. note:: null values from input array are preserved unless adding null to stopWords explicitly. >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["text"]) >>> remover = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"]) diff --git a/python/pyspark/ml/linalg/__init__.py b/python/pyspark/ml/linalg/__init__.py index a5df727fdb418..1705c156ce4c8 100644 --- a/python/pyspark/ml/linalg/__init__.py +++ b/python/pyspark/ml/linalg/__init__.py @@ -746,11 +746,12 @@ def __hash__(self): class Vectors(object): """ - Factory methods for working with vectors. Note that dense vectors - are simply represented as NumPy array objects, so there is no need - to covert them for use in MLlib. For sparse vectors, the factory - methods in this class create an MLlib-compatible type, or users - can pass in SciPy's C{scipy.sparse} column vectors. + Factory methods for working with vectors. + + .. note:: Dense vectors are simply represented as NumPy array objects, + so there is no need to covert them for use in MLlib. For sparse vectors, + the factory methods in this class create an MLlib-compatible type, or users + can pass in SciPy's C{scipy.sparse} column vectors. """ @staticmethod diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 385391ba53fd4..b42e807069802 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -245,9 +245,9 @@ def explainedVariance(self): .. seealso:: `Wikipedia explain variation \ `_ - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("explainedVariance") @@ -259,9 +259,9 @@ def meanAbsoluteError(self): corresponding to the expected value of the absolute error loss or l1-norm loss. - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("meanAbsoluteError") @@ -273,9 +273,9 @@ def meanSquaredError(self): corresponding to the expected value of the squared error loss or quadratic loss. - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("meanSquaredError") @@ -286,9 +286,9 @@ def rootMeanSquaredError(self): Returns the root mean squared error, which is defined as the square root of the mean squared error. 
- Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("rootMeanSquaredError") @@ -301,9 +301,9 @@ def r2(self): .. seealso:: `Wikipedia coefficient of determination \ ` - Note: This ignores instance weights (setting all to 1.0) from - `LinearRegression.weightCol`. This will change in later Spark - versions. + .. note:: This ignores instance weights (setting all to 1.0) from + `LinearRegression.weightCol`. This will change in later Spark + versions. """ return self._call_java("r2") @@ -822,7 +822,7 @@ def featureImportances(self): where gain is scaled by the number of instances passing through node - Normalize importances for tree to sum to 1. - Note: Feature importance for single decision trees can have high variance due to + .. note:: Feature importance for single decision trees can have high variance due to correlated predictor variables. Consider using a :py:class:`RandomForestRegressor` to determine feature importance instead. """ diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index 2036168e456fd..91123ace3387e 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -699,9 +699,9 @@ class StreamingKMeansModel(KMeansModel): * n_t+1: New number of weights. * a: Decay Factor, which gives the forgetfulness. - Note that if a is set to 1, it is the weighted mean of the previous - and new data. If it set to zero, the old centroids are completely - forgotten. + .. note:: If a is set to 1, it is the weighted mean of the previous + and new data. If it set to zero, the old centroids are completely + forgotten. :param clusterCenters: Initial cluster centers. diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 7eaa2282cb8bb..bde0f67be775c 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -114,9 +114,9 @@ def transform(self, vector): """ Applies transformation on a vector or an RDD[Vector]. - Note: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. note:: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. :param vector: Vector or RDD of Vector to be transformed. """ @@ -139,9 +139,9 @@ def transform(self, vector): """ Applies standardization transformation on a vector. - Note: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. note:: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. :param vector: Vector or RDD of Vector to be standardized. :return: Standardized vector. If the variance of a column is @@ -407,7 +407,7 @@ class HashingTF(object): Maps a sequence of terms to their term frequencies using the hashing trick. - Note: the terms must be hashable (can not be dict/set/list...). + .. note:: The terms must be hashable (can not be dict/set/list...). :param numFeatures: number of features (default: 2^20) @@ -469,9 +469,9 @@ def transform(self, x): the terms which occur in fewer than `minDocFreq` documents will have an entry of 0. 
- Note: In Python, transform cannot currently be used within - an RDD transformation or action. - Call transform directly on the RDD instead. + .. note:: In Python, transform cannot currently be used within + an RDD transformation or action. + Call transform directly on the RDD instead. :param x: an RDD of term frequency vectors or a term frequency vector @@ -551,7 +551,7 @@ def transform(self, word): """ Transforms a word to its vector representation - Note: local use only + .. note:: Local use only :param word: a word :return: vector representation of word(s) @@ -570,7 +570,7 @@ def findSynonyms(self, word, num): :param num: number of synonyms to find :return: array of (word, cosineSimilarity) - Note: local use only + .. note:: Local use only """ if not isinstance(word, basestring): word = _convert_to_vector(word) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index d37e715c8d8ec..031f22c02098e 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -835,11 +835,12 @@ def __hash__(self): class Vectors(object): """ - Factory methods for working with vectors. Note that dense vectors - are simply represented as NumPy array objects, so there is no need - to covert them for use in MLlib. For sparse vectors, the factory - methods in this class create an MLlib-compatible type, or users - can pass in SciPy's C{scipy.sparse} column vectors. + Factory methods for working with vectors. + + .. note:: Dense vectors are simply represented as NumPy array objects, + so there is no need to covert them for use in MLlib. For sparse vectors, + the factory methods in this class create an MLlib-compatible type, or users + can pass in SciPy's C{scipy.sparse} column vectors. """ @staticmethod diff --git a/python/pyspark/mllib/linalg/distributed.py b/python/pyspark/mllib/linalg/distributed.py index 538cada7d163d..600655c912ca6 100644 --- a/python/pyspark/mllib/linalg/distributed.py +++ b/python/pyspark/mllib/linalg/distributed.py @@ -171,8 +171,9 @@ def computeColumnSummaryStatistics(self): def computeCovariance(self): """ Computes the covariance matrix, treating each row as an - observation. Note that this cannot be computed on matrices - with more than 65535 columns. + observation. + + .. note:: This cannot be computed on matrices with more than 65535 columns. >>> rows = sc.parallelize([[1, 2], [2, 1]]) >>> mat = RowMatrix(rows) @@ -185,8 +186,9 @@ def computeCovariance(self): @since('2.0.0') def computeGramianMatrix(self): """ - Computes the Gramian matrix `A^T A`. Note that this cannot be - computed on matrices with more than 65535 columns. + Computes the Gramian matrix `A^T A`. + + .. note:: This cannot be computed on matrices with more than 65535 columns. >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]]) >>> mat = RowMatrix(rows) @@ -458,8 +460,9 @@ def columnSimilarities(self): @since('2.0.0') def computeGramianMatrix(self): """ - Computes the Gramian matrix `A^T A`. Note that this cannot be - computed on matrices with more than 65535 columns. + Computes the Gramian matrix `A^T A`. + + .. note:: This cannot be computed on matrices with more than 65535 columns. >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]), ... 
IndexedRow(1, [4, 5, 6])]) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 705022934e41b..1b66f5b51044b 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -44,7 +44,7 @@ class LabeledPoint(object): Vector of features for this point (NumPy array, list, pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix). - Note: 'label' and 'features' are accessible as class attributes. + .. note:: 'label' and 'features' are accessible as class attributes. .. versionadded:: 1.0.0 """ diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 67d5f0e44f41c..49b26446dbc32 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -164,7 +164,6 @@ def chiSqTest(observed, expected=None): of fit test of the observed data against the expected distribution, or againt the uniform distribution (by default), with each category having an expected frequency of `1 / len(observed)`. - (Note: `observed` cannot contain negative values) If `observed` is matrix, conduct Pearson's independence test on the input contingency matrix, which cannot contain negative entries or @@ -176,6 +175,8 @@ def chiSqTest(observed, expected=None): contingency matrix for which the chi-squared statistic is computed. All label and feature values must be categorical. + .. note:: `observed` cannot contain negative values + :param observed: it could be a vector containing the observed categorical counts/relative frequencies, or the contingency matrix (containing either counts or relative frequencies), diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index b3011d42e56af..a6089fc8b9d32 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -40,9 +40,9 @@ def predict(self, x): Predict values for a single data point or an RDD of points using the model trained. - Note: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. note:: In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -85,9 +85,9 @@ def predict(self, x): """ Predict the label of one or more examples. - Note: In Python, predict cannot currently be used within an RDD - transformation or action. - Call predict directly on the RDD instead. + .. note:: In Python, predict cannot currently be used within an RDD + transformation or action. + Call predict directly on the RDD instead. :param x: Data point (feature vector), or an RDD of data points (feature diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index f21a364df9100..9e05da89af082 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -417,10 +417,8 @@ def sample(self, withReplacement, fraction, seed=None): with replacement: expected number of times each element is chosen; fraction must be >= 0 :param seed: seed for the random number generator - .. note:: - - This is not guaranteed to provide exactly the fraction specified of the total count - of the given :class:`DataFrame`. + .. note:: This is not guaranteed to provide exactly the fraction specified of the total + count of the given :class:`DataFrame`. 
>>> rdd = sc.parallelize(range(100), 4) >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14 @@ -460,8 +458,8 @@ def takeSample(self, withReplacement, num, seed=None): """ Return a fixed-size sampled subset of this RDD. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. >>> rdd = sc.parallelize(range(0, 10)) >>> len(rdd.takeSample(True, 20, 1)) @@ -572,7 +570,7 @@ def intersection(self, other): Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did. - Note that this method performs a shuffle internally. + .. note:: This method performs a shuffle internally. >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5]) >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8]) @@ -803,8 +801,9 @@ def func(it): def collect(self): """ Return a list that contains all of the elements in this RDD. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. """ with SCCallSiteSync(self.context) as css: port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) @@ -1251,10 +1250,10 @@ def top(self, num, key=None): """ Get the top N elements from an RDD. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. - Note: It returns the list sorted in descending order. + .. note:: It returns the list sorted in descending order. >>> sc.parallelize([10, 4, 2, 12, 3]).top(1) [12] @@ -1276,8 +1275,8 @@ def takeOrdered(self, num, key=None): Get the N elements from an RDD ordered in ascending order or as specified by the optional key function. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: this method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6) [1, 2, 3, 4, 5, 6] @@ -1298,11 +1297,11 @@ def take(self, num): that partition to estimate the number of additional partitions needed to satisfy the limit. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. - Translated from the Scala implementation in RDD#take(). + .. note:: this method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. + >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2) [2, 3] >>> sc.parallelize([2, 3, 4, 5, 6]).take(10) @@ -1366,8 +1365,9 @@ def first(self): def isEmpty(self): """ - Returns true if and only if the RDD contains no elements at all. Note that an RDD - may be empty even when it has at least 1 partition. + Returns true if and only if the RDD contains no elements at all. + + .. note:: an RDD may be empty even when it has at least 1 partition. 
>>> sc.parallelize([]).isEmpty() True @@ -1558,8 +1558,8 @@ def collectAsMap(self): """ Return the key-value pairs in this RDD to the master as a dictionary. - Note that this method should only be used if the resulting data is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: this method should only be used if the resulting data is expected + to be small, as all the data is loaded into the driver's memory. >>> m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap() >>> m[1] @@ -1796,8 +1796,7 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners, set of aggregation functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined - type" C. Note that V and C can be different -- for example, one might - group an RDD of type (Int, Int) into an RDD of type (Int, List[Int]). + type" C. Users provide three functions: @@ -1809,6 +1808,9 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners, In addition, users can control the partitioning of the output RDD. + .. note:: V and C can be different -- for example, one might group an RDD of type + (Int, Int) into an RDD of type (Int, List[Int]). + >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> def add(a, b): return a + str(b) >>> sorted(x.combineByKey(str, add, add).collect()) @@ -1880,9 +1882,9 @@ def groupByKey(self, numPartitions=None, partitionFunc=portable_hash): Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with numPartitions partitions. - Note: If you are grouping in order to perform an aggregation (such as a - sum or average) over each key, using reduceByKey or aggregateByKey will - provide much better performance. + .. note:: If you are grouping in order to perform an aggregation (such as a + sum or average) over each key, using reduceByKey or aggregateByKey will + provide much better performance. >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> sorted(rdd.groupByKey().mapValues(len).collect()) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 6fe622643291e..b9d90384e3e2c 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -457,7 +457,7 @@ def foreachPartition(self, f): def cache(self): """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}). - .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. + .. note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. """ self.is_cached = True self._jdf.cache() @@ -470,7 +470,7 @@ def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK): a new storage level if the :class:`DataFrame` does not have a storage level set yet. If no storage level is specified defaults to (C{MEMORY_AND_DISK}). - .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. + .. note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0. """ self.is_cached = True javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel) @@ -597,10 +597,8 @@ def distinct(self): def sample(self, withReplacement, fraction, seed=None): """Returns a sampled subset of this :class:`DataFrame`. - .. note:: - - This is not guaranteed to provide exactly the fraction specified of the total count - of the given :class:`DataFrame`. + .. note:: This is not guaranteed to provide exactly the fraction specified of the total + count of the given :class:`DataFrame`. 
>>> df.sample(False, 0.5, 42).count() 2 @@ -866,8 +864,8 @@ def describe(self, *cols): This include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns. - .. note:: This function is meant for exploratory data analysis, as we make no \ - guarantee about the backward compatibility of the schema of the resulting DataFrame. + .. note:: This function is meant for exploratory data analysis, as we make no + guarantee about the backward compatibility of the schema of the resulting DataFrame. >>> df.describe(['age']).show() +-------+------------------+ @@ -900,8 +898,8 @@ def describe(self, *cols): def head(self, n=None): """Returns the first ``n`` rows. - Note that this method should only be used if the resulting array is expected - to be small, as all the data is loaded into the driver's memory. + .. note:: This method should only be used if the resulting array is expected + to be small, as all the data is loaded into the driver's memory. :param n: int, default 1. Number of rows to return. :return: If n is greater than 1, return a list of :class:`Row`. @@ -1462,8 +1460,8 @@ def freqItems(self, cols, support=None): "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases. - .. note:: This function is meant for exploratory data analysis, as we make no \ - guarantee about the backward compatibility of the schema of the resulting DataFrame. + .. note:: This function is meant for exploratory data analysis, as we make no + guarantee about the backward compatibility of the schema of the resulting DataFrame. :param cols: Names of the columns to calculate frequent items for as a list or tuple of strings. @@ -1564,11 +1562,11 @@ def toDF(self, *cols): def toPandas(self): """Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. - Note that this method should only be used if the resulting Pandas's DataFrame is expected - to be small, as all the data is loaded into the driver's memory. - This is only available if Pandas is installed and available. + .. note:: This method should only be used if the resulting Pandas's DataFrame is expected + to be small, as all the data is loaded into the driver's memory. + >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 46a092f16d4fc..d8abafcde3846 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -359,7 +359,7 @@ def grouping_id(*cols): (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn) - .. note:: the list of columns should match with grouping columns exactly, or empty (means all + .. note:: The list of columns should match with grouping columns exactly, or empty (means all the grouping columns). >>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show() @@ -547,7 +547,7 @@ def shiftRightUnsigned(col, numBits): def spark_partition_id(): """A column for partition ID. - Note that this is indeterministic because it depends on data partitioning and task scheduling. + .. note:: This is indeterministic because it depends on data partitioning and task scheduling. 
>>> df.repartition(1).select(spark_partition_id().alias("pid")).collect() [Row(pid=0), Row(pid=0)] @@ -1852,9 +1852,10 @@ def __call__(self, *cols): @since(1.3) def udf(f, returnType=StringType()): """Creates a :class:`Column` expression representing a user defined function (UDF). - Note that the user-defined functions must be deterministic. Due to optimization, - duplicate invocations may be eliminated or the function may even be invoked more times than - it is present in the query. + + .. note:: The user-defined functions must be deterministic. Due to optimization, + duplicate invocations may be eliminated or the function may even be invoked more times than + it is present in the query. :param f: python function :param returnType: a :class:`pyspark.sql.types.DataType` object diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 0e4589be976ea..9c3a237699f96 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -90,10 +90,12 @@ def awaitTermination(self, timeout=None): @since(2.0) def processAllAvailable(self): """Blocks until all available data in the source has been processed and committed to the - sink. This method is intended for testing. Note that in the case of continually arriving - data, this method may block forever. Additionally, this method is only guaranteed to block - until data that has been synchronously appended data to a stream source prior to invocation. - (i.e. `getOffset` must immediately reflect the addition). + sink. This method is intended for testing. + + .. note:: In the case of continually arriving data, this method may block forever. + Additionally, this method is only guaranteed to block until data that has been + synchronously appended data to a stream source prior to invocation. + (i.e. `getOffset` must immediately reflect the addition). """ return self._jsq.processAllAvailable() diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ec3ad9933cf60..17c34f8a1c54c 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -304,7 +304,7 @@ def queueStream(self, rdds, oneAtATime=True, default=None): Create an input stream from an queue of RDDs or list. In each batch, it will process either one or all of the RDDs returned by the queue. - NOTE: changes to the queue after the stream is created will not be recognized. + .. note:: Changes to the queue after the stream is created will not be recognized. @param rdds: Queue of RDDs @param oneAtATime: pick one rdd each time or pick all of them once. diff --git a/python/pyspark/streaming/kinesis.py b/python/pyspark/streaming/kinesis.py index 434ce83e1e6f9..3a8d8b819fd37 100644 --- a/python/pyspark/streaming/kinesis.py +++ b/python/pyspark/streaming/kinesis.py @@ -42,8 +42,8 @@ def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName, Create an input stream that pulls messages from a Kinesis stream. This uses the Kinesis Client Library (KCL) to pull messages from Kinesis. - Note: The given AWS credentials will get saved in DStream checkpoints if checkpointing is - enabled. Make sure that your checkpoint directory is secure. + .. note:: The given AWS credentials will get saved in DStream checkpoints if checkpointing + is enabled. Make sure that your checkpoint directory is secure. 
:param ssc: StreamingContext object :param kinesisAppName: Kinesis application name used by the Kinesis Client Library (KCL) to From 0e60e4b88014fcdd54acc650bfd3a1683f06f09e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 22 Nov 2016 09:16:20 -0800 Subject: [PATCH 169/534] [SPARK-18519][SQL] map type can not be used in EqualTo ## What changes were proposed in this pull request? Technically map type is not orderable, but can be used in equality comparison. However, due to the limitation of the current implementation, map type can't be used in equality comparison so that it can't be join key or grouping key. This PR makes this limitation explicit, to avoid wrong result. ## How was this patch tested? updated tests. Author: Wenchen Fan Closes #15956 from cloud-fan/map-type. (cherry picked from commit bb152cdfbb8d02130c71d2326ae81939725c2cf0) Signed-off-by: Herman van Hovell --- .../sql/catalyst/analysis/CheckAnalysis.scala | 15 ------- .../sql/catalyst/expressions/predicates.scala | 30 +++++++++++++ .../analysis/AnalysisErrorSuite.scala | 44 +++++++------------ .../ExpressionTypeCheckingSuite.scala | 2 + 4 files changed, 48 insertions(+), 43 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 98e50d0d3c674..80e577e5c4c79 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -183,21 +183,6 @@ trait CheckAnalysis extends PredicateHelper { s"join condition '${condition.sql}' " + s"of type ${condition.dataType.simpleString} is not a boolean.") - case j @ Join(_, _, _, Some(condition)) => - def checkValidJoinConditionExprs(expr: Expression): Unit = expr match { - case p: Predicate => - p.asInstanceOf[Expression].children.foreach(checkValidJoinConditionExprs) - case e if e.dataType.isInstanceOf[BinaryType] => - failAnalysis(s"binary type expression ${e.sql} cannot be used " + - "in join conditions") - case e if e.dataType.isInstanceOf[MapType] => - failAnalysis(s"map type expression ${e.sql} cannot be used " + - "in join conditions") - case _ => // OK - } - - checkValidJoinConditionExprs(condition) - case Aggregate(groupingExprs, aggregateExprs, child) => def checkValidAggregateExpression(expr: Expression): Unit = expr match { case aggExpr: AggregateExpression => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 7946c201f4ffc..2ad452b6a90ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -412,6 +412,21 @@ case class EqualTo(left: Expression, right: Expression) override def inputType: AbstractDataType = AnyDataType + override def checkInputDataTypes(): TypeCheckResult = { + super.checkInputDataTypes() match { + case TypeCheckResult.TypeCheckSuccess => + // TODO: although map type is not orderable, technically map type should be able to be used + // in equality comparison, remove this type check once we support it. 
+ if (left.dataType.existsRecursively(_.isInstanceOf[MapType])) { + TypeCheckResult.TypeCheckFailure("Cannot use map type in EqualTo, but the actual " + + s"input type is ${left.dataType.catalogString}.") + } else { + TypeCheckResult.TypeCheckSuccess + } + case failure => failure + } + } + override def symbol: String = "=" protected override def nullSafeEval(input1: Any, input2: Any): Any = { @@ -440,6 +455,21 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp override def inputType: AbstractDataType = AnyDataType + override def checkInputDataTypes(): TypeCheckResult = { + super.checkInputDataTypes() match { + case TypeCheckResult.TypeCheckSuccess => + // TODO: although map type is not orderable, technically map type should be able to be used + // in equality comparison, remove this type check once we support it. + if (left.dataType.existsRecursively(_.isInstanceOf[MapType])) { + TypeCheckResult.TypeCheckFailure("Cannot use map type in EqualNullSafe, but the actual " + + s"input type is ${left.dataType.catalogString}.") + } else { + TypeCheckResult.TypeCheckSuccess + } + case failure => failure + } + } + override def symbol: String = "<=>" override def nullable: Boolean = false diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index 21afe9fec5944..8c1faea2394c6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -465,34 +465,22 @@ class AnalysisErrorSuite extends AnalysisTest { "another aggregate function." :: Nil) } - test("Join can't work on binary and map types") { - val plan = - Join( - LocalRelation( - AttributeReference("a", BinaryType)(exprId = ExprId(2)), - AttributeReference("b", IntegerType)(exprId = ExprId(1))), - LocalRelation( - AttributeReference("c", BinaryType)(exprId = ExprId(4)), - AttributeReference("d", IntegerType)(exprId = ExprId(3))), - Cross, - Some(EqualTo(AttributeReference("a", BinaryType)(exprId = ExprId(2)), - AttributeReference("c", BinaryType)(exprId = ExprId(4))))) - - assertAnalysisError(plan, "binary type expression `a` cannot be used in join conditions" :: Nil) - - val plan2 = - Join( - LocalRelation( - AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)), - AttributeReference("b", IntegerType)(exprId = ExprId(1))), - LocalRelation( - AttributeReference("c", MapType(IntegerType, StringType))(exprId = ExprId(4)), - AttributeReference("d", IntegerType)(exprId = ExprId(3))), - Cross, - Some(EqualTo(AttributeReference("a", MapType(IntegerType, StringType))(exprId = ExprId(2)), - AttributeReference("c", MapType(IntegerType, StringType))(exprId = ExprId(4))))) - - assertAnalysisError(plan2, "map type expression `a` cannot be used in join conditions" :: Nil) + test("Join can work on binary types but can't work on map types") { + val left = LocalRelation('a.binary, 'b.map(StringType, StringType)) + val right = LocalRelation('c.binary, 'd.map(StringType, StringType)) + + val plan1 = left.join( + right, + joinType = Cross, + condition = Some('a === 'c)) + + assertAnalysisSuccess(plan1) + + val plan2 = left.join( + right, + joinType = Cross, + condition = Some('b === 'd)) + assertAnalysisError(plan2, "Cannot use map type in EqualTo" :: Nil) } test("PredicateSubQuery is used outside of a filter") { diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala index 542e654bbce12..744057b7c5f4c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala @@ -111,6 +111,8 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite { assertErrorForDifferingTypes(GreaterThan('intField, 'booleanField)) assertErrorForDifferingTypes(GreaterThanOrEqual('intField, 'booleanField)) + assertError(EqualTo('mapField, 'mapField), "Cannot use map type in EqualTo") + assertError(EqualNullSafe('mapField, 'mapField), "Cannot use map type in EqualNullSafe") assertError(LessThan('mapField, 'mapField), s"requires ${TypeCollection.Ordered.simpleString} type") assertError(LessThanOrEqual('mapField, 'mapField), From 0e624e990b3b426dba0a6149ad6340f85d214a58 Mon Sep 17 00:00:00 2001 From: Nattavut Sutyanyong Date: Tue, 22 Nov 2016 12:06:21 -0800 Subject: [PATCH 170/534] [SPARK-18504][SQL] Scalar subquery with extra group by columns returning incorrect result ## What changes were proposed in this pull request? This PR blocks an incorrect result scenario in scalar subquery where there are GROUP BY column(s) that are not part of the correlated predicate(s). Example: // Incorrect result Seq(1).toDF("c1").createOrReplaceTempView("t1") Seq((1,1),(1,2)).toDF("c1","c2").createOrReplaceTempView("t2") sql("select (select sum(-1) from t2 where t1.c1=t2.c1 group by t2.c2) from t1").show // How can selecting a scalar subquery from a 1-row table return 2 rows? ## How was this patch tested? sql/test, catalyst/test new test case covering the reported problem is added to SubquerySuite.scala Author: Nattavut Sutyanyong Closes #15936 from nsyca/scalarSubqueryIncorrect-1. 
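For reference, the rule this patch enforces can be stated compactly: the GROUP BY columns of a correlated scalar subquery must be a subset of the columns referenced by its correlated predicates. In the example above the subquery groups by t2.c2 while the only correlated predicate touches t2.c1, so for t1.c1 = 1 two groups (t2.c2 = 1 and t2.c2 = 2) survive and the "scalar" subquery produces two rows. A minimal standalone Scala sketch of the subset check, illustrative only and using plain column-name strings instead of Catalyst expression sets:

```scala
// Each GROUP BY column must also appear in a correlated predicate; otherwise one outer
// row can match several groups and the scalar subquery result is no longer unique.
def groupByIsValid(groupByCols: Set[String], correlatedCols: Set[String]): Boolean =
  groupByCols.subsetOf(correlatedCols)

assert(!groupByIsValid(Set("t2.c2"), Set("t2.c1"))) // the reported query: rejected
assert(groupByIsValid(Set("t2.c1"), Set("t2.c1")))  // grouping only by the correlated column: allowed
```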
(cherry picked from commit 45ea46b7b397f023b4da878eb11e21b08d931115) Signed-off-by: Herman van Hovell --- .../sql/catalyst/analysis/Analyzer.scala | 3 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 30 +++++++++++++++---- .../org/apache/spark/sql/SubquerySuite.scala | 12 ++++++++ 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index b7e167557c559..2918e9d158829 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1182,9 +1182,6 @@ class Analyzer( */ private def resolveSubQueries(plan: LogicalPlan, plans: Seq[LogicalPlan]): LogicalPlan = { plan transformExpressions { - case s @ ScalarSubquery(sub, conditions, exprId) - if sub.resolved && conditions.isEmpty && sub.output.size != 1 => - failAnalysis(s"Scalar subquery must return only one column, but got ${sub.output.size}") case s @ ScalarSubquery(sub, _, exprId) if !sub.resolved => resolveSubQuery(s, plans, 1)(ScalarSubquery(_, _, exprId)) case e @ Exists(sub, exprId) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 80e577e5c4c79..26d26385904f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -117,19 +117,37 @@ trait CheckAnalysis extends PredicateHelper { failAnalysis(s"Window specification $s is not valid because $m") case None => w } + case s @ ScalarSubquery(query, conditions, _) + // If no correlation, the output must be exactly one column + if (conditions.isEmpty && query.output.size != 1) => + failAnalysis( + s"Scalar subquery must return only one column, but got ${query.output.size}") case s @ ScalarSubquery(query, conditions, _) if conditions.nonEmpty => - // Make sure correlated scalar subqueries contain one row for every outer row by - // enforcing that they are aggregates which contain exactly one aggregate expressions. - // The analyzer has already checked that subquery contained only one output column, and - // added all the grouping expressions to the aggregate. - def checkAggregate(a: Aggregate): Unit = { - val aggregates = a.expressions.flatMap(_.collect { + def checkAggregate(agg: Aggregate): Unit = { + // Make sure correlated scalar subqueries contain one row for every outer row by + // enforcing that they are aggregates which contain exactly one aggregate expressions. + // The analyzer has already checked that subquery contained only one output column, + // and added all the grouping expressions to the aggregate. 
+ val aggregates = agg.expressions.flatMap(_.collect { case a: AggregateExpression => a }) if (aggregates.isEmpty) { failAnalysis("The output of a correlated scalar subquery must be aggregated") } + + // SPARK-18504: block cases where GROUP BY columns + // are not part of the correlated columns + val groupByCols = ExpressionSet.apply(agg.groupingExpressions.flatMap(_.references)) + val predicateCols = ExpressionSet.apply(conditions.flatMap(_.references)) + val invalidCols = groupByCols.diff(predicateCols) + // GROUP BY columns must be a subset of columns in the predicates + if (invalidCols.nonEmpty) { + failAnalysis( + "a GROUP BY clause in a scalar correlated subquery " + + "cannot contain non-correlated columns: " + + invalidCols.mkString(",")) + } } // Skip projects and subquery aliases added by the Analyzer and the SQLBuilder. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index c84a6f161893c..f1dd1c620e660 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -483,6 +483,18 @@ class SubquerySuite extends QueryTest with SharedSQLContext { Row(1, null) :: Row(2, 6.0) :: Row(3, 2.0) :: Row(null, null) :: Row(6, null) :: Nil) } + test("SPARK-18504 extra GROUP BY column in correlated scalar subquery is not permitted") { + withTempView("t") { + Seq((1, 1), (1, 2)).toDF("c1", "c2").createOrReplaceTempView("t") + + val errMsg = intercept[AnalysisException] { + sql("select (select sum(-1) from t t2 where t1.c2 = t2.c1 group by t2.c2) sum from t t1") + } + assert(errMsg.getMessage.contains( + "a GROUP BY clause in a scalar correlated subquery cannot contain non-correlated columns:")) + } + } + test("non-aggregated correlated scalar subquery") { val msg1 = intercept[AnalysisException] { sql("select a, (select b from l l2 where l2.a = l1.a) sum_b from l l1") From fa360134d06e5bfb423f0bd769edb47dbda1d9af Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 22 Nov 2016 15:25:22 -0500 Subject: [PATCH 171/534] [SPARK-18507][SQL] HiveExternalCatalog.listPartitions should only call getTable once ## What changes were proposed in this pull request? HiveExternalCatalog.listPartitions should only call `getTable` once, instead of calling it for every partitions. ## How was this patch tested? N/A Author: Wenchen Fan Closes #15978 from cloud-fan/perf. 
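The change boils down to hoisting a loop-invariant metastore lookup: the table metadata is fetched once and reused, instead of issuing one getTable round trip per partition. A minimal Scala sketch of the pattern, with hypothetical simplified types rather than the real HiveExternalCatalog/HiveClient API:

```scala
// Hypothetical stand-ins for CatalogTable / CatalogTablePartition.
final case class TableMeta(partitionColumnNames: Seq[String])
final case class Partition(spec: Map[String, String])

// Toy version of restorePartitionSpec: restore the original case of partition column names.
def restoreSpec(spec: Map[String, String], colNames: Seq[String]): Map[String, String] =
  spec.map { case (k, v) => colNames.find(_.equalsIgnoreCase(k)).getOrElse(k) -> v }

def listPartitions(fetchTable: () => TableMeta, parts: Seq[Partition]): Seq[Partition] = {
  val partColNames = fetchTable().partitionColumnNames // single metastore call, hoisted out of the map
  parts.map(p => p.copy(spec = restoreSpec(p.spec, partColNames)))
}
```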
(cherry picked from commit 702cd403fc8e5ce8281fe8828197ead46bdb8832) Signed-off-by: Andrew Or --- .../scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 5dbb4024bbee0..ff0923f04893d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -907,8 +907,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat db: String, table: String, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient { + val actualPartColNames = getTable(db, table).partitionColumnNames client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => - part.copy(spec = restorePartitionSpec(part.spec, getTable(db, table).partitionColumnNames)) + part.copy(spec = restorePartitionSpec(part.spec, actualPartColNames)) } } From fb2ea54a69b521463b93b270b63081da726ee036 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Tue, 22 Nov 2016 13:03:50 -0800 Subject: [PATCH 172/534] [SPARK-18465] Add 'IF EXISTS' clause to 'UNCACHE' to not throw exceptions when table doesn't exist ## What changes were proposed in this pull request? While this behavior is debatable, consider the following use case: ```sql UNCACHE TABLE foo; CACHE TABLE foo AS SELECT * FROM bar ``` The command above fails the first time you run it. But I want to run the command above over and over again, and I don't want to change my code just for the first run of it. The issue is that subsequent `CACHE TABLE` commands do not overwrite the existing table. Now we can do: ```sql UNCACHE TABLE IF EXISTS foo; CACHE TABLE foo AS SELECT * FROM bar ``` ## How was this patch tested? Unit tests Author: Burak Yavuz Closes #15896 from brkyvz/uncache. (cherry picked from commit bdc8153e8689262708c7fade5c065bd7fc8a84fc) Signed-off-by: Herman van Hovell --- .../org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../apache/spark/sql/execution/SparkSqlParser.scala | 2 +- .../apache/spark/sql/execution/command/cache.scala | 12 ++++++++++-- .../org/apache/spark/sql/hive/CachedTableSuite.scala | 5 ++++- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index b599a884957a8..0aa2a97407c53 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -142,7 +142,7 @@ statement | REFRESH TABLE tableIdentifier #refreshTable | REFRESH .*? #refreshResource | CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable - | UNCACHE TABLE tableIdentifier #uncacheTable + | UNCACHE TABLE (IF EXISTS)? tableIdentifier #uncacheTable | CLEAR CACHE #clearCache | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE tableIdentifier partitionSpec? 
#loadData diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index b8be3d17ba444..47610453ac23a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -233,7 +233,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * Create an [[UncacheTableCommand]] logical plan. */ override def visitUncacheTable(ctx: UncacheTableContext): LogicalPlan = withOrigin(ctx) { - UncacheTableCommand(visitTableIdentifier(ctx.tableIdentifier)) + UncacheTableCommand(visitTableIdentifier(ctx.tableIdentifier), ctx.EXISTS != null) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala index c31f4dc9aba4b..336f14dd97aea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.command import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -49,10 +50,17 @@ case class CacheTableCommand( } -case class UncacheTableCommand(tableIdent: TableIdentifier) extends RunnableCommand { +case class UncacheTableCommand( + tableIdent: TableIdentifier, + ifExists: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - sparkSession.catalog.uncacheTable(tableIdent.quotedString) + val tableId = tableIdent.quotedString + try { + sparkSession.catalog.uncacheTable(tableId) + } catch { + case _: NoSuchTableException if ifExists => // don't throw + } Seq.empty[Row] } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index fc35304c80ecc..3871b3d785882 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -101,13 +101,16 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("DROP TABLE IF EXISTS nonexistantTable") } - test("correct error on uncache of nonexistant tables") { + test("uncache of nonexistant tables") { + // make sure table doesn't exist + intercept[NoSuchTableException](spark.table("nonexistantTable")) intercept[NoSuchTableException] { spark.catalog.uncacheTable("nonexistantTable") } intercept[NoSuchTableException] { sql("UNCACHE TABLE nonexistantTable") } + sql("UNCACHE TABLE IF EXISTS nonexistantTable") } test("no error on uncache of non-cached table") { From bd338f60d7f30f0cb735dffb39b3a6ec60766301 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Tue, 22 Nov 2016 14:15:57 -0800 Subject: [PATCH 173/534] [SPARK-18373][SPARK-18529][SS][KAFKA] Make failOnDataLoss=false work with Spark jobs ## What changes were proposed in this pull request? This PR adds `CachedKafkaConsumer.getAndIgnoreLostData` to handle corner cases of `failOnDataLoss=false`. 
It also resolves [SPARK-18529](https://issues.apache.org/jira/browse/SPARK-18529) after refactoring codes: Timeout will throw a TimeoutException. ## How was this patch tested? Because I cannot find any way to manually control the Kafka server to clean up logs, it's impossible to write unit tests for each corner case. Therefore, I just created `test("stress test for failOnDataLoss=false")` which should cover most of corner cases. I also modified some existing tests to test for both `failOnDataLoss=false` and `failOnDataLoss=true` to make sure it doesn't break existing logic. Author: Shixiong Zhu Closes #15820 from zsxwing/failOnDataLoss. (cherry picked from commit 2fd101b2f0028e005fbb0bdd29e59af37aa637da) Signed-off-by: Tathagata Das --- .../sql/kafka010/CachedKafkaConsumer.scala | 236 ++++++++++++-- .../spark/sql/kafka010/KafkaSource.scala | 23 +- .../spark/sql/kafka010/KafkaSourceRDD.scala | 42 ++- .../spark/sql/kafka010/KafkaSourceSuite.scala | 297 +++++++++++++++--- .../spark/sql/kafka010/KafkaTestUtils.scala | 20 +- 5 files changed, 523 insertions(+), 95 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala index 3b5a96534f9b6..3f438e99185b5 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/CachedKafkaConsumer.scala @@ -18,12 +18,16 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} +import java.util.concurrent.TimeoutException -import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer} +import scala.collection.JavaConverters._ + +import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer, OffsetOutOfRangeException} import org.apache.kafka.common.TopicPartition import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging +import org.apache.spark.sql.kafka010.KafkaSource._ /** @@ -34,10 +38,18 @@ import org.apache.spark.internal.Logging private[kafka010] case class CachedKafkaConsumer private( topicPartition: TopicPartition, kafkaParams: ju.Map[String, Object]) extends Logging { + import CachedKafkaConsumer._ private val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] - private val consumer = { + private var consumer = createConsumer + + /** Iterator to the already fetch data */ + private var fetchedData = ju.Collections.emptyIterator[ConsumerRecord[Array[Byte], Array[Byte]]] + private var nextOffsetInFetchedData = UNKNOWN_OFFSET + + /** Create a KafkaConsumer to fetch records for `topicPartition` */ + private def createConsumer: KafkaConsumer[Array[Byte], Array[Byte]] = { val c = new KafkaConsumer[Array[Byte], Array[Byte]](kafkaParams) val tps = new ju.ArrayList[TopicPartition]() tps.add(topicPartition) @@ -45,42 +57,193 @@ private[kafka010] case class CachedKafkaConsumer private( c } - /** Iterator to the already fetch data */ - private var fetchedData = ju.Collections.emptyIterator[ConsumerRecord[Array[Byte], Array[Byte]]] - private var nextOffsetInFetchedData = -2L - /** - * Get the record for the given offset, waiting up to timeout ms if IO is necessary. - * Sequential forward access will use buffers, but random access will be horribly inefficient. + * Get the record for the given offset if available. 
Otherwise it will either throw error + * (if failOnDataLoss = true), or return the next available offset within [offset, untilOffset), + * or null. + * + * @param offset the offset to fetch. + * @param untilOffset the max offset to fetch. Exclusive. + * @param pollTimeoutMs timeout in milliseconds to poll data from Kafka. + * @param failOnDataLoss When `failOnDataLoss` is `true`, this method will either return record at + * offset if available, or throw exception.when `failOnDataLoss` is `false`, + * this method will either return record at offset if available, or return + * the next earliest available record less than untilOffset, or null. It + * will not throw any exception. */ - def get(offset: Long, pollTimeoutMs: Long): ConsumerRecord[Array[Byte], Array[Byte]] = { + def get( + offset: Long, + untilOffset: Long, + pollTimeoutMs: Long, + failOnDataLoss: Boolean): ConsumerRecord[Array[Byte], Array[Byte]] = { + require(offset < untilOffset, + s"offset must always be less than untilOffset [offset: $offset, untilOffset: $untilOffset]") logDebug(s"Get $groupId $topicPartition nextOffset $nextOffsetInFetchedData requested $offset") - if (offset != nextOffsetInFetchedData) { - logInfo(s"Initial fetch for $topicPartition $offset") - seek(offset) - poll(pollTimeoutMs) + // The following loop is basically for `failOnDataLoss = false`. When `failOnDataLoss` is + // `false`, first, we will try to fetch the record at `offset`. If no such record exists, then + // we will move to the next available offset within `[offset, untilOffset)` and retry. + // If `failOnDataLoss` is `true`, the loop body will be executed only once. + var toFetchOffset = offset + while (toFetchOffset != UNKNOWN_OFFSET) { + try { + return fetchData(toFetchOffset, pollTimeoutMs) + } catch { + case e: OffsetOutOfRangeException => + // When there is some error thrown, it's better to use a new consumer to drop all cached + // states in the old consumer. We don't need to worry about the performance because this + // is not a common path. + resetConsumer() + reportDataLoss(failOnDataLoss, s"Cannot fetch offset $toFetchOffset", e) + toFetchOffset = getEarliestAvailableOffsetBetween(toFetchOffset, untilOffset) + } } + resetFetchedData() + null + } - if (!fetchedData.hasNext()) { poll(pollTimeoutMs) } - assert(fetchedData.hasNext(), - s"Failed to get records for $groupId $topicPartition $offset " + - s"after polling for $pollTimeoutMs") - var record = fetchedData.next() + /** + * Return the next earliest available offset in [offset, untilOffset). If all offsets in + * [offset, untilOffset) are invalid (e.g., the topic is deleted and recreated), it will return + * `UNKNOWN_OFFSET`. + */ + private def getEarliestAvailableOffsetBetween(offset: Long, untilOffset: Long): Long = { + val (earliestOffset, latestOffset) = getAvailableOffsetRange() + logWarning(s"Some data may be lost. Recovering from the earliest offset: $earliestOffset") + if (offset >= latestOffset || earliestOffset >= untilOffset) { + // [offset, untilOffset) and [earliestOffset, latestOffset) have no overlap, + // either + // -------------------------------------------------------- + // ^ ^ ^ ^ + // | | | | + // earliestOffset latestOffset offset untilOffset + // + // or + // -------------------------------------------------------- + // ^ ^ ^ ^ + // | | | | + // offset untilOffset earliestOffset latestOffset + val warningMessage = + s""" + |The current available offset range is [$earliestOffset, $latestOffset). 
+ | Offset ${offset} is out of range, and records in [$offset, $untilOffset) will be + | skipped ${additionalMessage(failOnDataLoss = false)} + """.stripMargin + logWarning(warningMessage) + UNKNOWN_OFFSET + } else if (offset >= earliestOffset) { + // ----------------------------------------------------------------------------- + // ^ ^ ^ ^ + // | | | | + // earliestOffset offset min(untilOffset,latestOffset) max(untilOffset, latestOffset) + // + // This will happen when a topic is deleted and recreated, and new data are pushed very fast, + // then we will see `offset` disappears first then appears again. Although the parameters + // are same, the state in Kafka cluster is changed, so the outer loop won't be endless. + logWarning(s"Found a disappeared offset $offset. " + + s"Some data may be lost ${additionalMessage(failOnDataLoss = false)}") + offset + } else { + // ------------------------------------------------------------------------------ + // ^ ^ ^ ^ + // | | | | + // offset earliestOffset min(untilOffset,latestOffset) max(untilOffset, latestOffset) + val warningMessage = + s""" + |The current available offset range is [$earliestOffset, $latestOffset). + | Offset ${offset} is out of range, and records in [$offset, $earliestOffset) will be + | skipped ${additionalMessage(failOnDataLoss = false)} + """.stripMargin + logWarning(warningMessage) + earliestOffset + } + } - if (record.offset != offset) { - logInfo(s"Buffer miss for $groupId $topicPartition $offset") + /** + * Get the record at `offset`. + * + * @throws OffsetOutOfRangeException if `offset` is out of range + * @throws TimeoutException if cannot fetch the record in `pollTimeoutMs` milliseconds. + */ + private def fetchData( + offset: Long, + pollTimeoutMs: Long): ConsumerRecord[Array[Byte], Array[Byte]] = { + if (offset != nextOffsetInFetchedData || !fetchedData.hasNext()) { + // This is the first fetch, or the last pre-fetched data has been drained. + // Seek to the offset because we may call seekToBeginning or seekToEnd before this. seek(offset) poll(pollTimeoutMs) - assert(fetchedData.hasNext(), - s"Failed to get records for $groupId $topicPartition $offset " + - s"after polling for $pollTimeoutMs") - record = fetchedData.next() + } + + if (!fetchedData.hasNext()) { + // We cannot fetch anything after `poll`. Two possible cases: + // - `offset` is out of range so that Kafka returns nothing. Just throw + // `OffsetOutOfRangeException` to let the caller handle it. + // - Cannot fetch any data before timeout. TimeoutException will be thrown. + val (earliestOffset, latestOffset) = getAvailableOffsetRange() + if (offset < earliestOffset || offset >= latestOffset) { + throw new OffsetOutOfRangeException( + Map(topicPartition -> java.lang.Long.valueOf(offset)).asJava) + } else { + throw new TimeoutException( + s"Cannot fetch record for offset $offset in $pollTimeoutMs milliseconds") + } + } else { + val record = fetchedData.next() + nextOffsetInFetchedData = record.offset + 1 + // `seek` is always called before "poll". So "record.offset" must be same as "offset". 
assert(record.offset == offset, - s"Got wrong record for $groupId $topicPartition even after seeking to offset $offset") + s"The fetched data has a different offset: expected $offset but was ${record.offset}") + record } + } + + /** Create a new consumer and reset cached states */ + private def resetConsumer(): Unit = { + consumer.close() + consumer = createConsumer + resetFetchedData() + } - nextOffsetInFetchedData = offset + 1 - record + /** Reset the internal pre-fetched data. */ + private def resetFetchedData(): Unit = { + nextOffsetInFetchedData = UNKNOWN_OFFSET + fetchedData = ju.Collections.emptyIterator[ConsumerRecord[Array[Byte], Array[Byte]]] + } + + /** + * Return an addition message including useful message and instruction. + */ + private def additionalMessage(failOnDataLoss: Boolean): String = { + if (failOnDataLoss) { + s"(GroupId: $groupId, TopicPartition: $topicPartition). " + + s"$INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE" + } else { + s"(GroupId: $groupId, TopicPartition: $topicPartition). " + + s"$INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE" + } + } + + /** + * Throw an exception or log a warning as per `failOnDataLoss`. + */ + private def reportDataLoss( + failOnDataLoss: Boolean, + message: String, + cause: Throwable = null): Unit = { + val finalMessage = s"$message ${additionalMessage(failOnDataLoss)}" + if (failOnDataLoss) { + if (cause != null) { + throw new IllegalStateException(finalMessage) + } else { + throw new IllegalStateException(finalMessage, cause) + } + } else { + if (cause != null) { + logWarning(finalMessage) + } else { + logWarning(finalMessage, cause) + } + } } private def close(): Unit = consumer.close() @@ -96,10 +259,24 @@ private[kafka010] case class CachedKafkaConsumer private( logDebug(s"Polled $groupId ${p.partitions()} ${r.size}") fetchedData = r.iterator } + + /** + * Return the available offset range of the current partition. It's a pair of the earliest offset + * and the latest offset. + */ + private def getAvailableOffsetRange(): (Long, Long) = { + consumer.seekToBeginning(Set(topicPartition).asJava) + val earliestOffset = consumer.position(topicPartition) + consumer.seekToEnd(Set(topicPartition).asJava) + val latestOffset = consumer.position(topicPartition) + (earliestOffset, latestOffset) + } } private[kafka010] object CachedKafkaConsumer extends Logging { + private val UNKNOWN_OFFSET = -2L + private case class CacheKey(groupId: String, topicPartition: TopicPartition) private lazy val cache = { @@ -140,7 +317,10 @@ private[kafka010] object CachedKafkaConsumer extends Logging { // If this is reattempt at running the task, then invalidate cache and start with // a new consumer if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) { - cache.remove(key) + val removedConsumer = cache.remove(key) + if (removedConsumer != null) { + removedConsumer.close() + } new CachedKafkaConsumer(topicPartition, kafkaParams) } else { if (!cache.containsKey(key)) { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 341081a338c0e..1d0d402b82a35 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -281,7 +281,7 @@ private[kafka010] case class KafkaSource( // Create an RDD that reads from Kafka and get the (key, value) pair as byte arrays. 
val rdd = new KafkaSourceRDD( - sc, executorKafkaParams, offsetRanges, pollTimeoutMs).map { cr => + sc, executorKafkaParams, offsetRanges, pollTimeoutMs, failOnDataLoss).map { cr => Row(cr.key, cr.value, cr.topic, cr.partition, cr.offset, cr.timestamp, cr.timestampType.id) } @@ -463,10 +463,9 @@ private[kafka010] case class KafkaSource( */ private def reportDataLoss(message: String): Unit = { if (failOnDataLoss) { - throw new IllegalStateException(message + - ". Set the source option 'failOnDataLoss' to 'false' if you want to ignore these checks.") + throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") } else { - logWarning(message) + logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") } } } @@ -475,6 +474,22 @@ private[kafka010] case class KafkaSource( /** Companion object for the [[KafkaSource]]. */ private[kafka010] object KafkaSource { + val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE = + """ + |Some data may have been lost because they are not available in Kafka any more; either the + | data was aged out by Kafka or the topic may have been deleted before all the data in the + | topic was processed. If you want your streaming query to fail on such cases, set the source + | option "failOnDataLoss" to "true". + """.stripMargin + + val INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE = + """ + |Some data may have been lost because they are not available in Kafka any more; either the + | data was aged out by Kafka or the topic may have been deleted before all the data in the + | topic was processed. If you don't want your streaming query to fail on such cases, set the + | source option "failOnDataLoss" to "false". + """.stripMargin + def kafkaSchema: StructType = StructType(Seq( StructField("key", BinaryType), StructField("value", BinaryType), diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala index 802dd040aed93..244cd2c225bdd 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala @@ -28,6 +28,7 @@ import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.NextIterator /** Offset range that one partition of the KafkaSourceRDD has to read */ @@ -61,7 +62,8 @@ private[kafka010] class KafkaSourceRDD( sc: SparkContext, executorKafkaParams: ju.Map[String, Object], offsetRanges: Seq[KafkaSourceRDDOffsetRange], - pollTimeoutMs: Long) + pollTimeoutMs: Long, + failOnDataLoss: Boolean) extends RDD[ConsumerRecord[Array[Byte], Array[Byte]]](sc, Nil) { override def persist(newLevel: StorageLevel): this.type = { @@ -130,23 +132,31 @@ private[kafka010] class KafkaSourceRDD( logInfo(s"Beginning offset ${range.fromOffset} is the same as ending offset " + s"skipping ${range.topic} ${range.partition}") Iterator.empty - } else { - - val consumer = CachedKafkaConsumer.getOrCreate( - range.topic, range.partition, executorKafkaParams) - var requestOffset = range.fromOffset - - logDebug(s"Creating iterator for $range") - - new Iterator[ConsumerRecord[Array[Byte], Array[Byte]]]() { - override def hasNext(): Boolean = requestOffset < range.untilOffset - override def next(): ConsumerRecord[Array[Byte], Array[Byte]] = { - 
assert(hasNext(), "Can't call next() once untilOffset has been reached") - val r = consumer.get(requestOffset, pollTimeoutMs) - requestOffset += 1 - r + new NextIterator[ConsumerRecord[Array[Byte], Array[Byte]]]() { + val consumer = CachedKafkaConsumer.getOrCreate( + range.topic, range.partition, executorKafkaParams) + var requestOffset = range.fromOffset + + override def getNext(): ConsumerRecord[Array[Byte], Array[Byte]] = { + if (requestOffset >= range.untilOffset) { + // Processed all offsets in this partition. + finished = true + null + } else { + val r = consumer.get(requestOffset, range.untilOffset, pollTimeoutMs, failOnDataLoss) + if (r == null) { + // Losing some data. Skip the rest offsets in this partition. + finished = true + null + } else { + requestOffset = r.offset + 1 + r + } + } } + + override protected def close(): Unit = {} } } } diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 89e713f92df46..cd52fd93d10a4 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -17,8 +17,12 @@ package org.apache.spark.sql.kafka010 +import java.util.Properties +import java.util.concurrent.ConcurrentLinkedQueue import java.util.concurrent.atomic.AtomicInteger +import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.util.Random import org.apache.kafka.clients.producer.RecordMetadata @@ -27,8 +31,9 @@ import org.scalatest.concurrent.Eventually._ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ +import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.streaming.{ ProcessingTime, StreamTest } +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.test.SharedSQLContext abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { @@ -202,7 +207,7 @@ class KafkaSourceSuite extends KafkaSourceTest { test("cannot stop Kafka stream") { val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 5) + testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, (101 to 105).map { _.toString }.toArray) val reader = spark @@ -223,52 +228,85 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } - test("assign from latest offsets") { - val topic = newTopic() - testFromLatestOffsets(topic, false, "assign" -> assignString(topic, 0 to 4)) - } + for (failOnDataLoss <- Seq(true, false)) { + test(s"assign from latest offsets (failOnDataLoss: $failOnDataLoss)") { + val topic = newTopic() + testFromLatestOffsets( + topic, + addPartitions = false, + failOnDataLoss = failOnDataLoss, + "assign" -> assignString(topic, 0 to 4)) + } - test("assign from earliest offsets") { - val topic = newTopic() - testFromEarliestOffsets(topic, false, "assign" -> assignString(topic, 0 to 4)) - } + test(s"assign from earliest offsets (failOnDataLoss: $failOnDataLoss)") { + val topic = newTopic() + testFromEarliestOffsets( + topic, + addPartitions = false, + failOnDataLoss = failOnDataLoss, + "assign" -> assignString(topic, 0 to 4)) + } - test("assign from specific offsets") { - val topic = newTopic() - testFromSpecificOffsets(topic, "assign" -> assignString(topic, 0 to 4)) - } + test(s"assign from specific 
offsets (failOnDataLoss: $failOnDataLoss)") { + val topic = newTopic() + testFromSpecificOffsets( + topic, + failOnDataLoss = failOnDataLoss, + "assign" -> assignString(topic, 0 to 4), + "failOnDataLoss" -> failOnDataLoss.toString) + } - test("subscribing topic by name from latest offsets") { - val topic = newTopic() - testFromLatestOffsets(topic, true, "subscribe" -> topic) - } + test(s"subscribing topic by name from latest offsets (failOnDataLoss: $failOnDataLoss)") { + val topic = newTopic() + testFromLatestOffsets( + topic, + addPartitions = true, + failOnDataLoss = failOnDataLoss, + "subscribe" -> topic) + } - test("subscribing topic by name from earliest offsets") { - val topic = newTopic() - testFromEarliestOffsets(topic, true, "subscribe" -> topic) - } + test(s"subscribing topic by name from earliest offsets (failOnDataLoss: $failOnDataLoss)") { + val topic = newTopic() + testFromEarliestOffsets( + topic, + addPartitions = true, + failOnDataLoss = failOnDataLoss, + "subscribe" -> topic) + } - test("subscribing topic by name from specific offsets") { - val topic = newTopic() - testFromSpecificOffsets(topic, "subscribe" -> topic) - } + test(s"subscribing topic by name from specific offsets (failOnDataLoss: $failOnDataLoss)") { + val topic = newTopic() + testFromSpecificOffsets(topic, failOnDataLoss = failOnDataLoss, "subscribe" -> topic) + } - test("subscribing topic by pattern from latest offsets") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-suffix" - testFromLatestOffsets(topic, true, "subscribePattern" -> s"$topicPrefix-.*") - } + test(s"subscribing topic by pattern from latest offsets (failOnDataLoss: $failOnDataLoss)") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-suffix" + testFromLatestOffsets( + topic, + addPartitions = true, + failOnDataLoss = failOnDataLoss, + "subscribePattern" -> s"$topicPrefix-.*") + } - test("subscribing topic by pattern from earliest offsets") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-suffix" - testFromEarliestOffsets(topic, true, "subscribePattern" -> s"$topicPrefix-.*") - } + test(s"subscribing topic by pattern from earliest offsets (failOnDataLoss: $failOnDataLoss)") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-suffix" + testFromEarliestOffsets( + topic, + addPartitions = true, + failOnDataLoss = failOnDataLoss, + "subscribePattern" -> s"$topicPrefix-.*") + } - test("subscribing topic by pattern from specific offsets") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-suffix" - testFromSpecificOffsets(topic, "subscribePattern" -> s"$topicPrefix-.*") + test(s"subscribing topic by pattern from specific offsets (failOnDataLoss: $failOnDataLoss)") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-suffix" + testFromSpecificOffsets( + topic, + failOnDataLoss = failOnDataLoss, + "subscribePattern" -> s"$topicPrefix-.*") + } } test("subscribing topic by pattern with topic deletions") { @@ -413,13 +451,59 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("delete a topic when a Spark job is running") { + KafkaSourceSuite.collectedData.clear() + + val topic = newTopic() + testUtils.createTopic(topic, partitions = 1) + testUtils.sendMessages(topic, (1 to 10).map(_.toString).toArray) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribe", topic) + // If a topic is deleted and we try to poll data starting from offset 0, + // 
the Kafka consumer will just block until timeout and return an empty result. + // So set the timeout to 1 second to make this test fast. + .option("kafkaConsumer.pollTimeoutMs", "1000") + .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + KafkaSourceSuite.globalTestUtils = testUtils + // The following ForeachWriter will delete the topic before fetching data from Kafka + // in executors. + val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + override def open(partitionId: Long, version: Long): Boolean = { + KafkaSourceSuite.globalTestUtils.deleteTopic(topic) + true + } + + override def process(value: Int): Unit = { + KafkaSourceSuite.collectedData.add(value) + } + + override def close(errorOrNull: Throwable): Unit = {} + }).start() + query.processAllAvailable() + query.stop() + // `failOnDataLoss` is `false`, we should not fail the query + assert(query.exception.isEmpty) + } + private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } - private def testFromSpecificOffsets(topic: String, options: (String, String)*): Unit = { + private def testFromSpecificOffsets( + topic: String, + failOnDataLoss: Boolean, + options: (String, String)*): Unit = { val partitionOffsets = Map( new TopicPartition(topic, 0) -> -2L, new TopicPartition(topic, 1) -> -1L, @@ -448,6 +532,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .option("startingOffsets", startingOffsets) .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") + .option("failOnDataLoss", failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -469,6 +554,7 @@ class KafkaSourceSuite extends KafkaSourceTest { private def testFromLatestOffsets( topic: String, addPartitions: Boolean, + failOnDataLoss: Boolean, options: (String, String)*): Unit = { testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, Array("-1")) @@ -480,6 +566,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .option("startingOffsets", s"latest") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") + .option("failOnDataLoss", failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -513,6 +600,7 @@ class KafkaSourceSuite extends KafkaSourceTest { private def testFromEarliestOffsets( topic: String, addPartitions: Boolean, + failOnDataLoss: Boolean, options: (String, String)*): Unit = { testUtils.createTopic(topic, partitions = 5) testUtils.sendMessages(topic, (1 to 3).map { _.toString }.toArray) @@ -524,6 +612,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .option("startingOffsets", s"earliest") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") + .option("failOnDataLoss", failOnDataLoss.toString) options.foreach { case (k, v) => reader.option(k, v) } val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -552,6 +641,11 @@ class KafkaSourceSuite extends KafkaSourceTest { } } +object KafkaSourceSuite { + @volatile 
var globalTestUtils: KafkaTestUtils = _ + val collectedData = new ConcurrentLinkedQueue[Any]() +} + class KafkaSourceStressSuite extends KafkaSourceTest { @@ -615,7 +709,7 @@ class KafkaSourceStressSuite extends KafkaSourceTest { } }) case 2 => // Add new partitions - AddKafkaData(topics.toSet, d: _*)(message = "Add partitiosn", + AddKafkaData(topics.toSet, d: _*)(message = "Add partition", topicAction = (topic, partition) => { testUtils.addPartitions(topic, partition.get + nextInt(1, 6)) }) @@ -626,3 +720,122 @@ class KafkaSourceStressSuite extends KafkaSourceTest { iterations = 50) } } + +class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with SharedSQLContext { + + import testImplicits._ + + private var testUtils: KafkaTestUtils = _ + + private val topicId = new AtomicInteger(0) + + private def newTopic(): String = s"failOnDataLoss-${topicId.getAndIncrement()}" + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils { + override def brokerConfiguration: Properties = { + val props = super.brokerConfiguration + // Try to make Kafka clean up messages as fast as possible. However, there is a hard-code + // 30 seconds delay (kafka.log.LogManager.InitialTaskDelayMs) so this test should run at + // least 30 seconds. + props.put("log.cleaner.backoff.ms", "100") + props.put("log.segment.bytes", "40") + props.put("log.retention.bytes", "40") + props.put("log.retention.check.interval.ms", "100") + props.put("delete.retention.ms", "10") + props.put("log.flush.scheduler.interval.ms", "10") + props + } + } + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + super.afterAll() + } + } + + test("stress test for failOnDataLoss=false") { + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", "failOnDataLoss.*") + .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + + override def open(partitionId: Long, version: Long): Boolean = { + true + } + + override def process(value: Int): Unit = { + // Slow down the processing speed so that messages may be aged out. + Thread.sleep(Random.nextInt(500)) + } + + override def close(errorOrNull: Throwable): Unit = { + } + }).start() + + val testTime = 1.minutes + val startTime = System.currentTimeMillis() + // Track the current existing topics + val topics = mutable.ArrayBuffer[String]() + // Track topics that have been deleted + val deletedTopics = mutable.Set[String]() + while (System.currentTimeMillis() - testTime.toMillis < startTime) { + Random.nextInt(10) match { + case 0 => // Create a new topic + val topic = newTopic() + topics += topic + // As pushing messages into Kafka updates Zookeeper asynchronously, there is a small + // chance that a topic will be recreated after deletion due to the asynchronous update. + // Hence, always overwrite to handle this race condition. 
+ testUtils.createTopic(topic, partitions = 1, overwrite = true) + logInfo(s"Create topic $topic") + case 1 if topics.nonEmpty => // Delete an existing topic + val topic = topics.remove(Random.nextInt(topics.size)) + testUtils.deleteTopic(topic) + logInfo(s"Delete topic $topic") + deletedTopics += topic + case 2 if deletedTopics.nonEmpty => // Recreate a topic that was deleted. + val topic = deletedTopics.toSeq(Random.nextInt(deletedTopics.size)) + deletedTopics -= topic + topics += topic + // As pushing messages into Kafka updates Zookeeper asynchronously, there is a small + // chance that a topic will be recreated after deletion due to the asynchronous update. + // Hence, always overwrite to handle this race condition. + testUtils.createTopic(topic, partitions = 1, overwrite = true) + logInfo(s"Create topic $topic") + case 3 => + Thread.sleep(1000) + case _ => // Push random messages + for (topic <- topics) { + val size = Random.nextInt(10) + for (_ <- 0 until size) { + testUtils.sendMessages(topic, Array(Random.nextInt(10).toString)) + } + } + } + // `failOnDataLoss` is `false`, we should not fail the query + if (query.exception.nonEmpty) { + throw query.exception.get + } + } + + query.stop() + // `failOnDataLoss` is `false`, we should not fail the query + if (query.exception.nonEmpty) { + throw query.exception.get + } + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index 9b24ccdd560e8..f43917e151c57 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -155,8 +155,16 @@ class KafkaTestUtils extends Logging { } /** Create a Kafka topic and wait until it is propagated to the whole cluster */ - def createTopic(topic: String, partitions: Int): Unit = { - AdminUtils.createTopic(zkUtils, topic, partitions, 1) + def createTopic(topic: String, partitions: Int, overwrite: Boolean = false): Unit = { + var created = false + while (!created) { + try { + AdminUtils.createTopic(zkUtils, topic, partitions, 1) + created = true + } catch { + case e: kafka.common.TopicExistsException if overwrite => deleteTopic(topic) + } + } // wait until metadata is propagated (0 until partitions).foreach { p => waitUntilMetadataIsPropagated(topic, p) @@ -244,7 +252,7 @@ class KafkaTestUtils extends Logging { offsets } - private def brokerConfiguration: Properties = { + protected def brokerConfiguration: Properties = { val props = new Properties() props.put("broker.id", "0") props.put("host.name", "localhost") @@ -302,9 +310,11 @@ class KafkaTestUtils extends Logging { } checkpoints.forall(checkpointsPerLogDir => !checkpointsPerLogDir.contains(tp)) }) - deletePath && topicPath && replicaManager && logManager && cleaner + // ensure the topic is gone + val deleted = !zkUtils.getAllTopics().contains(topic) + deletePath && topicPath && replicaManager && logManager && cleaner && deleted } - eventually(timeout(10.seconds)) { + eventually(timeout(60.seconds)) { assert(isDeleted, s"$topic not deleted after timeout") } } From 64b9de9c079672eff49dc38e55749d9a26c743a6 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 22 Nov 2016 15:10:49 -0800 Subject: [PATCH 174/534] [SPARK-16803][SQL] SaveAsTable does not work when target table is a Hive serde table ### What changes were proposed in this pull request? 
In Spark 2.0, `SaveAsTable` does not work when the target table is a Hive serde table, but Spark 1.6 works. **Spark 1.6** ``` Scala scala> sql("create table sample.sample stored as SEQUENCEFILE as select 1 as key, 'abc' as value") res2: org.apache.spark.sql.DataFrame = [] scala> val df = sql("select key, value as value from sample.sample") df: org.apache.spark.sql.DataFrame = [key: int, value: string] scala> df.write.mode("append").saveAsTable("sample.sample") scala> sql("select * from sample.sample").show() +---+-----+ |key|value| +---+-----+ | 1| abc| | 1| abc| +---+-----+ ``` **Spark 2.0** ``` Scala scala> df.write.mode("append").saveAsTable("sample.sample") org.apache.spark.sql.AnalysisException: Saving data in MetastoreRelation sample, sample is not supported.; ``` So far, we do not plan to support it in Spark 2.1 due to the risk. Spark 1.6 works because it internally uses insertInto. But, if we change it back it will break the semantic of saveAsTable (this method uses by-name resolution instead of using by-position resolution used by insertInto). More extra changes are needed to support `hive` as a `format` in DataFrameWriter. Instead, users should use insertInto API. This PR corrects the error messages. Users can understand how to bypass it before we support it in a separate PR. ### How was this patch tested? Test cases are added Author: gatorsmile Closes #15926 from gatorsmile/saveAsTableFix5. (cherry picked from commit 9c42d4a76ca8046fcca2e20067f2aa461977e65a) Signed-off-by: gatorsmile --- .../command/createDataSourceTables.scala | 4 ++++ .../sql/hive/MetastoreDataSourcesSuite.scala | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 7e16e43f2bb0e..add732c1afc16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -175,6 +175,10 @@ case class CreateDataSourceTableAsSelectCommand( existingSchema = Some(l.schema) case s: SimpleCatalogRelation if DDLUtils.isDatasourceTable(s.metadata) => existingSchema = Some(s.metadata.schema) + case c: CatalogRelation if c.catalogTable.provider == Some(DDLUtils.HIVE_PROVIDER) => + throw new AnalysisException("Saving data in the Hive serde table " + + s"${c.catalogTable.identifier} is not supported yet. 
Please use the " + + "insertInto() API as an alternative..") case o => throw new AnalysisException(s"Saving data in ${o.toString} is not supported.") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 4ab1a54edc46d..c7cc75fbc8a07 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -413,6 +413,26 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } + test("saveAsTable(CTAS) using append and insertInto when the target table is Hive serde") { + val tableName = "tab1" + withTable(tableName) { + sql(s"CREATE TABLE $tableName STORED AS SEQUENCEFILE AS SELECT 1 AS key, 'abc' AS value") + + val df = sql(s"SELECT key, value FROM $tableName") + val e = intercept[AnalysisException] { + df.write.mode(SaveMode.Append).saveAsTable(tableName) + }.getMessage + assert(e.contains("Saving data in the Hive serde table `default`.`tab1` is not supported " + + "yet. Please use the insertInto() API as an alternative.")) + + df.write.insertInto(tableName) + checkAnswer( + sql(s"SELECT * FROM $tableName"), + Row(1, "abc") :: Row(1, "abc") :: Nil + ) + } + } + test("SPARK-5839 HiveMetastoreCatalog does not recognize table aliases of data source tables.") { withTable("savedJsonTable") { // Save the df as a managed table (by not specifying the path). From 4b96ffb13a5171ef422aed955fd6b50354ae4253 Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Tue, 22 Nov 2016 15:57:07 -0800 Subject: [PATCH 175/534] [SPARK-18533] Raise correct error upon specification of schema for datasource tables created using CTAS ## What changes were proposed in this pull request? Fixes the inconsistency of error raised between data source and hive serde tables when schema is specified in CTAS scenario. In the process the grammar for create table (datasource) is simplified. **before:** ``` SQL spark-sql> create table t2 (c1 int, c2 int) using parquet as select * from t1; Error in query: mismatched input 'as' expecting {, '.', 'OPTIONS', 'CLUSTERED', 'PARTITIONED'}(line 1, pos 64) == SQL == create table t2 (c1 int, c2 int) using parquet as select * from t1 ----------------------------------------------------------------^^^ ``` **After:** ```SQL spark-sql> create table t2 (c1 int, c2 int) using parquet as select * from t1 > ; Error in query: Operation not allowed: Schema may not be specified in a Create Table As Select (CTAS) statement(line 1, pos 0) == SQL == create table t2 (c1 int, c2 int) using parquet as select * from t1 ^^^ ``` ## How was this patch tested? Added a new test in CreateTableAsSelectSuite Author: Dilip Biswal Closes #15968 from dilipbiswal/ctas. 
(cherry picked from commit 39a1d30636857715247c82d551b200e1c331ad69) Signed-off-by: gatorsmile --- .../spark/sql/catalyst/parser/SqlBase.g4 | 6 +---- .../spark/sql/execution/SparkSqlParser.scala | 24 +++++++++++++++++-- .../sources/CreateTableAsSelectSuite.scala | 9 +++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 0aa2a97407c53..df85c70c6cdea 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -71,11 +71,7 @@ statement | createTableHeader ('(' colTypeList ')')? tableProvider (OPTIONS tablePropertyList)? (PARTITIONED BY partitionColumnNames=identifierList)? - bucketSpec? #createTableUsing - | createTableHeader tableProvider - (OPTIONS tablePropertyList)? - (PARTITIONED BY partitionColumnNames=identifierList)? - bucketSpec? AS? query #createTableUsing + bucketSpec? (AS? query)? #createTableUsing | createTableHeader ('(' columns=colTypeList ')')? (COMMENT STRING)? (PARTITIONED BY '(' partitionColumns=colTypeList ')')? diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 47610453ac23a..5f89a229d6242 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -322,7 +322,20 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { } /** - * Create a [[CreateTable]] logical plan. + * Create a data source table, returning a [[CreateTable]] logical plan. + * + * Expected format: + * {{{ + * CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name + * USING table_provider + * [OPTIONS table_property_list] + * [PARTITIONED BY (col_name, col_name, ...)] + * [CLUSTERED BY (col_name, col_name, ...) + * [SORTED BY (col_name [ASC|DESC], ...)] + * INTO num_buckets BUCKETS + * ] + * [AS select_statement]; + * }}} */ override def visitCreateTableUsing(ctx: CreateTableUsingContext): LogicalPlan = withOrigin(ctx) { val (table, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader) @@ -371,6 +384,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { operationNotAllowed("CREATE TEMPORARY TABLE ... USING ... AS query", ctx) } + // Don't allow explicit specification of schema for CTAS + if (schema.nonEmpty) { + operationNotAllowed( + "Schema may not be specified in a Create Table As Select (CTAS) statement", + ctx) + } CreateTable(tableDesc, mode, Some(query)) } else { if (temp) { @@ -1052,7 +1071,8 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { "CTAS statement." operationNotAllowed(errorMessage, ctx) } - // Just use whatever is projected in the select statement as our schema + + // Don't allow explicit specification of schema for CTAS. 
if (schema.nonEmpty) { operationNotAllowed( "Schema may not be specified in a Create Table As Select (CTAS) statement", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 5cc9467395adc..61939fe5ef5b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -249,4 +249,13 @@ class CreateTableAsSelectSuite } } } + + test("specifying the column list for CTAS") { + withTable("t") { + val e = intercept[ParseException] { + sql("CREATE TABLE t (a int, b int) USING parquet AS SELECT 1, 2") + }.getMessage + assert(e.contains("Schema may not be specified in a Create Table As Select (CTAS)")) + } + } } From 3be2d1e0b52bf15ac28a9f96b03ae048e680b035 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Tue, 22 Nov 2016 16:49:15 -0800 Subject: [PATCH 176/534] [SPARK-18530][SS][KAFKA] Change Kafka timestamp column type to TimestampType ## What changes were proposed in this pull request? Changed Kafka timestamp column type to TimestampType. ## How was this patch tested? `test("Kafka column types")`. Author: Shixiong Zhu Closes #15969 from zsxwing/SPARK-18530. (cherry picked from commit d0212eb0f22473ee5482fe98dafc24e16ffcfc63) Signed-off-by: Shixiong Zhu --- .../spark/sql/kafka010/KafkaSource.scala | 16 +++- .../spark/sql/kafka010/KafkaSourceSuite.scala | 81 ++++++++++++++++++- 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 1d0d402b82a35..d9ab4bb4f873d 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -32,9 +32,12 @@ import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.kafka010.KafkaSource._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.UninterruptibleThread /** @@ -282,7 +285,14 @@ private[kafka010] case class KafkaSource( // Create an RDD that reads from Kafka and get the (key, value) pair as byte arrays. val rdd = new KafkaSourceRDD( sc, executorKafkaParams, offsetRanges, pollTimeoutMs, failOnDataLoss).map { cr => - Row(cr.key, cr.value, cr.topic, cr.partition, cr.offset, cr.timestamp, cr.timestampType.id) + InternalRow( + cr.key, + cr.value, + UTF8String.fromString(cr.topic), + cr.partition, + cr.offset, + DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(cr.timestamp)), + cr.timestampType.id) } logInfo("GetBatch generating RDD of offset range: " + @@ -293,7 +303,7 @@ private[kafka010] case class KafkaSource( currentPartitionOffsets = Some(untilPartitionOffsets) } - sqlContext.createDataFrame(rdd, schema) + sqlContext.internalCreateDataFrame(rdd, schema) } /** Stop this source and free any resources it has allocated. 
*/ @@ -496,7 +506,7 @@ private[kafka010] object KafkaSource { StructField("topic", StringType), StructField("partition", IntegerType), StructField("offset", LongType), - StructField("timestamp", LongType), + StructField("timestamp", TimestampType), StructField("timestampType", IntegerType) )) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index cd52fd93d10a4..f9f62581a3066 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -17,11 +17,11 @@ package org.apache.spark.sql.kafka010 +import java.nio.charset.StandardCharsets.UTF_8 import java.util.Properties import java.util.concurrent.ConcurrentLinkedQueue import java.util.concurrent.atomic.AtomicInteger -import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.Random @@ -33,6 +33,7 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.test.SharedSQLContext @@ -551,6 +552,84 @@ class KafkaSourceSuite extends KafkaSourceTest { ) } + test("Kafka column types") { + val now = System.currentTimeMillis() + val topic = newTopic() + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) + + val kafka = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("startingOffsets", s"earliest") + .option("subscribe", topic) + .load() + + val query = kafka + .writeStream + .format("memory") + .outputMode("append") + .queryName("kafkaColumnTypes") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaColumnTypes").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + assert(row.getAs[Array[Byte]]("key") === null, s"Unexpected results: $row") + assert(row.getAs[Array[Byte]]("value") === "1".getBytes(UTF_8), s"Unexpected results: $row") + assert(row.getAs[String]("topic") === topic, s"Unexpected results: $row") + assert(row.getAs[Int]("partition") === 0, s"Unexpected results: $row") + assert(row.getAs[Long]("offset") === 0L, s"Unexpected results: $row") + // We cannot check the exact timestamp as it's the time that messages were inserted by the + // producer. So here we just use a low bound to make sure the internal conversion works. 
+ assert(row.getAs[java.sql.Timestamp]("timestamp").getTime >= now, s"Unexpected results: $row") + assert(row.getAs[Int]("timestampType") === 0, s"Unexpected results: $row") + query.stop() + } + + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() + val topic = newTopic() + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) + + val kafka = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("startingOffsets", s"earliest") + .option("subscribe", topic) + .load() + + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") + + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. So here we just use a low bound to make sure the internal + // conversion works. + assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() + } + private def testFromLatestOffsets( topic: String, addPartitions: Boolean, From fc5fee83e363bc6df22459a9b1ba2ba11bfdfa20 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 22 Nov 2016 19:17:48 -0800 Subject: [PATCH 177/534] [SPARK-18501][ML][SPARKR] Fix spark.glm errors when fitting on collinear data ## What changes were proposed in this pull request? * Fix SparkR ```spark.glm``` errors when fitting on collinear data, since ```standard error of coefficients, t value and p value``` are not available in this condition. * Scala/Python GLM summary should throw exception if users get ```standard error of coefficients, t value and p value``` but the underlying WLS was solved by local "l-bfgs". ## How was this patch tested? Add unit tests. Author: Yanbo Liang Closes #15930 from yanboliang/spark-18501. (cherry picked from commit 982b82e32e0fc7d30c5d557944a79eb3e6d2da59) Signed-off-by: Yanbo Liang --- R/pkg/R/mllib.R | 21 ++++++-- R/pkg/inst/tests/testthat/test_mllib.R | 9 ++++ .../GeneralizedLinearRegressionWrapper.scala | 54 +++++++++++-------- .../GeneralizedLinearRegression.scala | 46 +++++++++++++--- .../GeneralizedLinearRegressionSuite.scala | 21 ++++++++ 5 files changed, 115 insertions(+), 36 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 265e64e7466fa..02bc6456de4d0 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -278,8 +278,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat #' @param object a fitted generalized linear model. #' @return \code{summary} returns a summary object of the fitted model, a list of components -#' including at least the coefficients, null/residual deviance, null/residual degrees -#' of freedom, AIC and number of iterations IRLS takes. 
+#' including at least the coefficients matrix (which includes coefficients, standard error +#' of coefficients, t value and p value), null/residual deviance, null/residual degrees of +#' freedom, AIC and number of iterations IRLS takes. If there are collinear columns +#' in you data, the coefficients matrix only provides coefficients. #' #' @rdname spark.glm #' @export @@ -303,9 +305,18 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), } else { dataFrame(callJMethod(jobj, "rDevianceResiduals")) } - coefficients <- matrix(coefficients, ncol = 4) - colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") - rownames(coefficients) <- unlist(features) + # If the underlying WeightedLeastSquares using "normal" solver, we can provide + # coefficients, standard error of coefficients, t value and p value. Otherwise, + # it will be fitted by local "l-bfgs", we can only provide coefficients. + if (length(features) == length(coefficients)) { + coefficients <- matrix(coefficients, ncol = 1) + colnames(coefficients) <- c("Estimate") + rownames(coefficients) <- unlist(features) + } else { + coefficients <- matrix(coefficients, ncol = 4) + colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") + rownames(coefficients) <- unlist(features) + } ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, dispersion = dispersion, null.deviance = null.deviance, deviance = deviance, df.null = df.null, df.residual = df.residual, diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 70a033de5308e..b05be476a3fa8 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -169,6 +169,15 @@ test_that("spark.glm summary", { df <- suppressWarnings(createDataFrame(data)) regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0)) expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result + + # Test spark.glm works on collinear data + A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2) + b <- c(1, 2, 3, 4) + data <- as.data.frame(cbind(A, b)) + df <- createDataFrame(data) + stats <- summary(spark.glm(df, b ~ . 
- 1)) + coefs <- unlist(stats$coefficients) + expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4)) }) test_that("spark.glm save/load", { diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index add4d49110d16..8bcc9fe5d1b85 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -144,30 +144,38 @@ private[r] object GeneralizedLinearRegressionWrapper features } - val rCoefficientStandardErrors = if (glm.getFitIntercept) { - Array(summary.coefficientStandardErrors.last) ++ - summary.coefficientStandardErrors.dropRight(1) + val rCoefficients: Array[Double] = if (summary.isNormalSolver) { + val rCoefficientStandardErrors = if (glm.getFitIntercept) { + Array(summary.coefficientStandardErrors.last) ++ + summary.coefficientStandardErrors.dropRight(1) + } else { + summary.coefficientStandardErrors + } + + val rTValues = if (glm.getFitIntercept) { + Array(summary.tValues.last) ++ summary.tValues.dropRight(1) + } else { + summary.tValues + } + + val rPValues = if (glm.getFitIntercept) { + Array(summary.pValues.last) ++ summary.pValues.dropRight(1) + } else { + summary.pValues + } + + if (glm.getFitIntercept) { + Array(glm.intercept) ++ glm.coefficients.toArray ++ + rCoefficientStandardErrors ++ rTValues ++ rPValues + } else { + glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues + } } else { - summary.coefficientStandardErrors - } - - val rTValues = if (glm.getFitIntercept) { - Array(summary.tValues.last) ++ summary.tValues.dropRight(1) - } else { - summary.tValues - } - - val rPValues = if (glm.getFitIntercept) { - Array(summary.pValues.last) ++ summary.pValues.dropRight(1) - } else { - summary.pValues - } - - val rCoefficients: Array[Double] = if (glm.getFitIntercept) { - Array(glm.intercept) ++ glm.coefficients.toArray ++ - rCoefficientStandardErrors ++ rTValues ++ rPValues - } else { - glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues + if (glm.getFitIntercept) { + Array(glm.intercept) ++ glm.coefficients.toArray + } else { + glm.coefficients.toArray + } } val rDispersion: Double = summary.dispersion diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 3f9de1fe74c9c..f33dd0fd294ba 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1063,45 +1063,75 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] ( import GeneralizedLinearRegression._ + /** + * Whether the underlying [[WeightedLeastSquares]] using the "normal" solver. + */ + private[ml] val isNormalSolver: Boolean = { + diagInvAtWA.length != 1 || diagInvAtWA(0) != 0 + } + /** * Standard error of estimated coefficients and intercept. + * This value is only available when the underlying [[WeightedLeastSquares]] + * using the "normal" solver. * * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, * then the last element returned corresponds to the intercept. 
*/ @Since("2.0.0") lazy val coefficientStandardErrors: Array[Double] = { - diagInvAtWA.map(_ * dispersion).map(math.sqrt) + if (isNormalSolver) { + diagInvAtWA.map(_ * dispersion).map(math.sqrt) + } else { + throw new UnsupportedOperationException( + "No Std. Error of coefficients available for this GeneralizedLinearRegressionModel") + } } /** * T-statistic of estimated coefficients and intercept. + * This value is only available when the underlying [[WeightedLeastSquares]] + * using the "normal" solver. * * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, * then the last element returned corresponds to the intercept. */ @Since("2.0.0") lazy val tValues: Array[Double] = { - val estimate = if (model.getFitIntercept) { - Array.concat(model.coefficients.toArray, Array(model.intercept)) + if (isNormalSolver) { + val estimate = if (model.getFitIntercept) { + Array.concat(model.coefficients.toArray, Array(model.intercept)) + } else { + model.coefficients.toArray + } + estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 } } else { - model.coefficients.toArray + throw new UnsupportedOperationException( + "No t-statistic available for this GeneralizedLinearRegressionModel") } - estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 } } /** * Two-sided p-value of estimated coefficients and intercept. + * This value is only available when the underlying [[WeightedLeastSquares]] + * using the "normal" solver. * * If [[GeneralizedLinearRegression.fitIntercept]] is set to true, * then the last element returned corresponds to the intercept. */ @Since("2.0.0") lazy val pValues: Array[Double] = { - if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) { - tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) } + if (isNormalSolver) { + if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) { + tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) } + } else { + tValues.map { x => + 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) + } + } } else { - tValues.map { x => 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) } + throw new UnsupportedOperationException( + "No p-value available for this GeneralizedLinearRegressionModel") } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 9b0fa67630d2e..4fab2160339c6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -1048,6 +1048,27 @@ class GeneralizedLinearRegressionSuite assert(summary.solver === "irls") } + test("glm handle collinear features") { + val collinearInstances = Seq( + Instance(1.0, 1.0, Vectors.dense(1.0, 2.0)), + Instance(2.0, 1.0, Vectors.dense(2.0, 4.0)), + Instance(3.0, 1.0, Vectors.dense(3.0, 6.0)), + Instance(4.0, 1.0, Vectors.dense(4.0, 8.0)) + ).toDF() + val trainer = new GeneralizedLinearRegression() + val model = trainer.fit(collinearInstances) + // to make it clear that underlying WLS did not solve analytically + intercept[UnsupportedOperationException] { + model.summary.coefficientStandardErrors + } + intercept[UnsupportedOperationException] { + model.summary.pValues + } + intercept[UnsupportedOperationException] { + model.summary.tValues + } + } + test("read/write") { def 
checkModelData( model: GeneralizedLinearRegressionModel, From fabb5aeaf62e5c18d5d489e769e998e52379ba20 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 22 Nov 2016 22:25:27 -0800 Subject: [PATCH 178/534] [SPARK-18179][SQL] Throws analysis exception with a proper message for unsupported argument types in reflect/java_method function ## What changes were proposed in this pull request? This PR proposes throwing an `AnalysisException` with a proper message rather than `NoSuchElementException` with the message ` key not found: TimestampType` when unsupported types are given to `reflect` and `java_method` functions. ```scala spark.range(1).selectExpr("reflect('java.lang.String', 'valueOf', cast('1990-01-01' as timestamp))") ``` produces **Before** ``` java.util.NoSuchElementException: key not found: TimestampType at scala.collection.MapLike$class.default(MapLike.scala:228) at scala.collection.AbstractMap.default(Map.scala:59) at scala.collection.MapLike$class.apply(MapLike.scala:141) at scala.collection.AbstractMap.apply(Map.scala:59) at org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection$$anonfun$findMethod$1$$anonfun$apply$1.apply(CallMethodViaReflection.scala:159) ... ``` **After** ``` cannot resolve 'reflect('java.lang.String', 'valueOf', CAST('1990-01-01' AS TIMESTAMP))' due to data type mismatch: arguments from the third require boolean, byte, short, integer, long, float, double or string expressions; line 1 pos 0; 'Project [unresolvedalias(reflect(java.lang.String, valueOf, cast(1990-01-01 as timestamp)), Some())] +- Range (0, 1, step=1, splits=Some(2)) ... ``` Added message is, ``` arguments from the third require boolean, byte, short, integer, long, float, double or string expressions ``` ## How was this patch tested? Tests added in `CallMethodViaReflection`. Author: hyukjinkwon Closes #15694 from HyukjinKwon/SPARK-18179. 
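A minimal sketch of the resulting behaviour, assuming the same `spark` session used in the snippets above (illustration only; the failing call is the one quoted in the description, and the supported argument types are the ones listed in the new error message):

```scala
// Supported argument types (boolean/byte/short/integer/long/float/double/string) still resolve;
// String.valueOf(long) matches the LongType `id` column.
spark.range(1).selectExpr("reflect('java.lang.String', 'valueOf', id)").show()

// An unsupported type such as timestamp now fails analysis with the new message,
// instead of surfacing a NoSuchElementException.
spark.range(1).selectExpr(
  "reflect('java.lang.String', 'valueOf', cast('1990-01-01' as timestamp))")
```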
(cherry picked from commit 2559fb4b40c9f42f7b3ed2b77de14461f68b6fa5) Signed-off-by: Reynold Xin --- .../catalyst/expressions/CallMethodViaReflection.scala | 4 ++++ .../expressions/CallMethodViaReflectionSuite.scala | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala index 40f1b148f9287..4859e0c537610 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflection.scala @@ -65,6 +65,10 @@ case class CallMethodViaReflection(children: Seq[Expression]) TypeCheckFailure("first two arguments should be string literals") } else if (!classExists) { TypeCheckFailure(s"class $className not found") + } else if (children.slice(2, children.length) + .exists(e => !CallMethodViaReflection.typeMapping.contains(e.dataType))) { + TypeCheckFailure("arguments from the third require boolean, byte, short, " + + "integer, long, float, double or string expressions") } else if (method == null) { TypeCheckFailure(s"cannot find a static method that matches the argument types in $className") } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflectionSuite.scala index 43367c7e14c34..88d4d460751b6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CallMethodViaReflectionSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions +import java.sql.Timestamp + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure import org.apache.spark.sql.types.{IntegerType, StringType} @@ -85,6 +87,13 @@ class CallMethodViaReflectionSuite extends SparkFunSuite with ExpressionEvalHelp assert(createExpr(staticClassName, "method1").checkInputDataTypes().isSuccess) } + test("unsupported type checking") { + val ret = createExpr(staticClassName, "method1", new Timestamp(1)).checkInputDataTypes() + assert(ret.isFailure) + val errorMsg = ret.asInstanceOf[TypeCheckFailure].message + assert(errorMsg.contains("arguments from the third require boolean, byte, short")) + } + test("invoking methods using acceptable types") { checkEvaluation(createExpr(staticClassName, "method1"), "m1") checkEvaluation(createExpr(staticClassName, "method2", 2), "m2") From 5f198d200d47703f6ab770e592c0a1d9f8d7b0dc Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 23 Nov 2016 11:25:47 +0000 Subject: [PATCH 179/534] [SPARK-18073][DOCS][WIP] Migrate wiki to spark.apache.org web site ## What changes were proposed in this pull request? Updates links to the wiki to links to the new location of content on spark.apache.org. ## How was this patch tested? Doc builds Author: Sean Owen Closes #15967 from srowen/SPARK-18073.1. 
(cherry picked from commit 7e0cd1d9b168286386f15e9b55988733476ae2bb) Signed-off-by: Sean Owen --- .github/PULL_REQUEST_TEMPLATE | 2 +- CONTRIBUTING.md | 4 ++-- R/README.md | 2 +- R/pkg/DESCRIPTION | 2 +- README.md | 11 ++++++----- dev/checkstyle.xml | 2 +- docs/_layouts/global.html | 4 ++-- docs/building-spark.md | 4 ++-- docs/contributing-to-spark.md | 2 +- docs/index.md | 4 ++-- docs/sparkr.md | 2 +- docs/streaming-programming-guide.md | 2 +- .../spark/sql/execution/datasources/DataSource.scala | 5 ++--- 13 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE index 0e41cf1826453..5af45d6fa7988 100644 --- a/.github/PULL_REQUEST_TEMPLATE +++ b/.github/PULL_REQUEST_TEMPLATE @@ -7,4 +7,4 @@ (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) -Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. +Please review http://spark.apache.org/contributing.html before opening a pull request. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1a8206abe3838..8fdd5aa9e7dfb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ ## Contributing to Spark *Before opening a pull request*, review the -[Contributing to Spark wiki](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark). +[Contributing to Spark guide](http://spark.apache.org/contributing.html). It lists steps that are required before creating a PR. In particular, consider: - Is the change important and ready enough to ask the community to spend time reviewing? - Have you searched for existing, related JIRAs and pull requests? -- Is this a new feature that can stand alone as a [third party project](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects) ? +- Is this a new feature that can stand alone as a [third party project](http://spark.apache.org/third-party-projects.html) ? - Is the change being proposed clearly explained and motivated? When you contribute code, you affirm that the contribution is your original work and that you diff --git a/R/README.md b/R/README.md index 47f9a86dfde11..4c40c5963db70 100644 --- a/R/README.md +++ b/R/README.md @@ -51,7 +51,7 @@ sparkR.session() #### Making changes to SparkR -The [instructions](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) for making contributions to Spark also apply to SparkR. +The [instructions](http://spark.apache.org/contributing.html) for making contributions to Spark also apply to SparkR. If you only make R file changes (i.e. no Scala changes) then you can just re-install the R package using `R/install-dev.sh` and test your changes. Once you have made your changes, please include unit tests for them and run existing unit tests using the `R/run-tests.sh` script as described below. 
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index fe41a9e7dabbd..981ae1246476b 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -11,7 +11,7 @@ Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), email = "felixcheung@apache.org"), person(family = "The Apache Software Foundation", role = c("aut", "cph"))) URL: http://www.apache.org/ http://spark.apache.org/ -BugReports: https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-ContributingBugReports +BugReports: http://spark.apache.org/contributing.html Depends: R (>= 3.0), methods diff --git a/README.md b/README.md index dd7d0e22495b3..853f7f5ded3cb 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,9 @@ To build Spark and its example programs, run: You can build Spark using more than one thread by using the -T option with Maven, see ["Parallel builds in Maven 3"](https://cwiki.apache.org/confluence/display/MAVEN/Parallel+builds+in+Maven+3). More detailed documentation is available from the project site, at ["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). -For developing Spark using an IDE, see [Eclipse](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-Eclipse) -and [IntelliJ](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IntelliJ). + +For general development tips, including info on developing Spark using an IDE, see +[http://spark.apache.org/developer-tools.html](the Useful Developer Tools page). ## Interactive Scala Shell @@ -80,7 +81,7 @@ can be run using: ./dev/run-tests Please see the guidance on how to -[run tests for a module, or individual tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools). +[run tests for a module, or individual tests](http://spark.apache.org/developer-tools.html#individual-tests). ## A Note About Hadoop Versions @@ -100,5 +101,5 @@ in the online documentation for an overview on how to configure Spark. ## Contributing -Please review the [Contribution to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) -wiki for information on how to get started contributing to the project. +Please review the [Contribution to Spark guide](http://spark.apache.org/contributing.html) +for information on how to get started contributing to the project. diff --git a/dev/checkstyle.xml b/dev/checkstyle.xml index 92c5251c85037..fd73ca73ee7ef 100644 --- a/dev/checkstyle.xml +++ b/dev/checkstyle.xml @@ -28,7 +28,7 @@ with Spark-specific changes from: - https://cwiki.apache.org/confluence/display/SPARK/Spark+Code+Style+Guide + http://spark.apache.org/contributing.html#code-style-guide Checkstyle is very configurable. Be sure to read the documentation at http://checkstyle.sf.net (or in your downloaded distribution). diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index ad5b5c9adfac8..c00d0db63cd10 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -113,8 +113,8 @@
  • Hardware Provisioning
  • Building Spark
  • -
  • Contributing to Spark
  • -
  • Third Party Projects
  • +
  • Contributing to Spark
  • +
  • Third Party Projects
  • diff --git a/docs/building-spark.md b/docs/building-spark.md index 88da0cc9c3bbf..65c2895b29b10 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -197,7 +197,7 @@ can be set to control the SBT build. For example: To avoid the overhead of launching sbt each time you need to re-compile, you can launch sbt in interactive mode by running `build/sbt`, and then run all build commands at the command prompt. For more recommendations on reducing build time, refer to the -[wiki page](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-ReducingBuildTimes). +[Useful Developer Tools page](http://spark.apache.org/developer-tools.html). ## Encrypted Filesystems @@ -215,7 +215,7 @@ to the `sharedSettings` val. See also [this PR](https://github.com/apache/spark/ ## IntelliJ IDEA or Eclipse For help in setting up IntelliJ IDEA or Eclipse for Spark development, and troubleshooting, refer to the -[wiki page for IDE setup](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IDESetup). +[Useful Developer Tools page](http://spark.apache.org/developer-tools.html). # Running Tests diff --git a/docs/contributing-to-spark.md b/docs/contributing-to-spark.md index ef1b3ad6da57a..9252545e4a129 100644 --- a/docs/contributing-to-spark.md +++ b/docs/contributing-to-spark.md @@ -5,4 +5,4 @@ title: Contributing to Spark The Spark team welcomes all forms of contributions, including bug reports, documentation or patches. For the newest information on how to contribute to the project, please read the -[wiki page on contributing to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark). +[Contributing to Spark guide](http://spark.apache.org/contributing.html). diff --git a/docs/index.md b/docs/index.md index 39de11de854a7..c5d34cb5c4e73 100644 --- a/docs/index.md +++ b/docs/index.md @@ -125,8 +125,8 @@ options for deployment: * Integration with other storage systems: * [OpenStack Swift](storage-openstack-swift.html) * [Building Spark](building-spark.html): build Spark using the Maven system -* [Contributing to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) -* [Third Party Projects](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects): related third party Spark projects +* [Contributing to Spark](http://spark.apache.org/contributing.html) +* [Third Party Projects](http://spark.apache.org/third-party-projects.html): related third party Spark projects **External Resources:** diff --git a/docs/sparkr.md b/docs/sparkr.md index f30bd4026fed3..d26949226b117 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -126,7 +126,7 @@ head(df) SparkR supports operating on a variety of data sources through the `SparkDataFrame` interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more [specific options](sql-programming-guide.html#manually-specifying-options) that are available for the built-in data sources. The general method for creating SparkDataFrames from data sources is `read.df`. This method takes in the path for the file to load and the type of data source, and the currently active SparkSession will be used automatically. 
-SparkR supports reading JSON, CSV and Parquet files natively, and through packages available from sources like [Third Party Projects](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects), you can find data source connectors for popular file formats like Avro. These packages can either be added by +SparkR supports reading JSON, CSV and Parquet files natively, and through packages available from sources like [Third Party Projects](http://spark.apache.org/third-party-projects.html), you can find data source connectors for popular file formats like Avro. These packages can either be added by specifying `--packages` with `spark-submit` or `sparkR` commands, or if initializing SparkSession with `sparkPackages` parameter when in an interactive R shell or from RStudio.
    diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 18fc1cd934826..1fcd198685a51 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -2382,7 +2382,7 @@ additional effort may be necessary to achieve exactly-once semantics. There are - [Kafka Integration Guide](streaming-kafka-integration.html) - [Kinesis Integration Guide](streaming-kinesis-integration.html) - [Custom Receiver Guide](streaming-custom-receivers.html) -* Third-party DStream data sources can be found in [Third Party Projects](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects) +* Third-party DStream data sources can be found in [Third Party Projects](http://spark.apache.org/third-party-projects.html) * API documentation - Scala docs * [StreamingContext](api/scala/index.html#org.apache.spark.streaming.StreamingContext) and diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index cfee7be1e3f07..84fde0bbf9268 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -505,12 +505,11 @@ object DataSource { provider1 == "com.databricks.spark.avro") { throw new AnalysisException( s"Failed to find data source: ${provider1.toLowerCase}. Please find an Avro " + - "package at " + - "https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects") + "package at http://spark.apache.org/third-party-projects.html") } else { throw new ClassNotFoundException( s"Failed to find data source: $provider1. Please find packages at " + - "https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects", + "http://spark.apache.org/third-party-projects.html", error) } } From ebeb051405b84cb4abafbb6929ddcfadf59672db Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 23 Nov 2016 04:15:19 -0800 Subject: [PATCH 180/534] [SPARK-18053][SQL] compare unsafe and safe complex-type values correctly ## What changes were proposed in this pull request? In Spark SQL, some expression may output safe format values, e.g. `CreateArray`, `CreateStruct`, `Cast`, etc. When we compare 2 values, we should be able to compare safe and unsafe formats. The `GreaterThan`, `LessThan`, etc. in Spark SQL already handles it, but the `EqualTo` doesn't. This PR fixes it. ## How was this patch tested? new unit test and regression test Author: Wenchen Fan Closes #15929 from cloud-fan/type-aware. 
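As an illustration of the regression being fixed, a sketch assuming a `SparkSession` named `spark` with `spark.implicits._` and `org.apache.spark.sql.functions._` in scope (the SQL is the same query exercised by the new regression test):

```scala
// The column read back from the saved table is in the unsafe format (UnsafeArrayData), while
// ARRAY(1L) is produced by CreateArray in the "safe" GenericArrayData format. Before this patch
// EqualTo fell back to plain object equality between the two representations, so the filter did
// not behave correctly; with ordering-based equality the single matching row is returned.
spark.range(10).select(array($"id").as("arr")).write.saveAsTable("array_tbl")
spark.sql("SELECT * FROM array_tbl WHERE arr = ARRAY(1L)").show()
```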
(cherry picked from commit 84284e8c82542d80dad94e458a0c0210bf803db3) Signed-off-by: Herman van Hovell --- .../sql/catalyst/expressions/UnsafeRow.java | 6 +--- .../expressions/codegen/CodeGenerator.scala | 20 ++++++++++-- .../sql/catalyst/expressions/predicates.scala | 32 +++---------------- .../catalyst/expressions/PredicateSuite.scala | 29 +++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 7 ++++ 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index c3f0abac244cf..d205547698c5b 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -578,12 +578,8 @@ public boolean equals(Object other) { return (sizeInBytes == o.sizeInBytes) && ByteArrayMethods.arrayEquals(baseObject, baseOffset, o.baseObject, o.baseOffset, sizeInBytes); - } else if (!(other instanceof InternalRow)) { - return false; - } else { - throw new IllegalArgumentException( - "Cannot compare UnsafeRow to " + other.getClass().getName()); } + return false; } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 9c3c6d3b2a7f2..09007b7c89fe3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -481,8 +481,13 @@ class CodegenContext { case FloatType => s"(java.lang.Float.isNaN($c1) && java.lang.Float.isNaN($c2)) || $c1 == $c2" case DoubleType => s"(java.lang.Double.isNaN($c1) && java.lang.Double.isNaN($c2)) || $c1 == $c2" case dt: DataType if isPrimitiveType(dt) => s"$c1 == $c2" + case dt: DataType if dt.isInstanceOf[AtomicType] => s"$c1.equals($c2)" + case array: ArrayType => genComp(array, c1, c2) + " == 0" + case struct: StructType => genComp(struct, c1, c2) + " == 0" case udt: UserDefinedType[_] => genEqual(udt.sqlType, c1, c2) - case other => s"$c1.equals($c2)" + case _ => + throw new IllegalArgumentException( + "cannot generate equality code for un-comparable type: " + dataType.simpleString) } /** @@ -512,6 +517,11 @@ class CodegenContext { val funcCode: String = s""" public int $compareFunc(ArrayData a, ArrayData b) { + // when comparing unsafe arrays, try equals first as it compares the binary directly + // which is very fast. + if (a instanceof UnsafeArrayData && b instanceof UnsafeArrayData && a.equals(b)) { + return 0; + } int lengthA = a.numElements(); int lengthB = b.numElements(); int $minLength = (lengthA > lengthB) ? lengthB : lengthA; @@ -551,6 +561,11 @@ class CodegenContext { val funcCode: String = s""" public int $compareFunc(InternalRow a, InternalRow b) { + // when comparing unsafe rows, try equals first as it compares the binary directly + // which is very fast. 
+ if (a instanceof UnsafeRow && b instanceof UnsafeRow && a.equals(b)) { + return 0; + } InternalRow i = null; $comparisons return 0; @@ -561,7 +576,8 @@ class CodegenContext { case other if other.isInstanceOf[AtomicType] => s"$c1.compare($c2)" case udt: UserDefinedType[_] => genComp(udt.sqlType, c1, c2) case _ => - throw new IllegalArgumentException("cannot generate compare code for un-comparable type") + throw new IllegalArgumentException( + "cannot generate compare code for un-comparable type: " + dataType.simpleString) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 2ad452b6a90ca..3fcbb05372d87 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -388,6 +388,8 @@ abstract class BinaryComparison extends BinaryOperator with Predicate { defineCodeGen(ctx, ev, (c1, c2) => s"${ctx.genComp(left.dataType, c1, c2)} $symbol 0") } } + + protected lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) } @@ -429,17 +431,7 @@ case class EqualTo(left: Expression, right: Expression) override def symbol: String = "=" - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - if (left.dataType == FloatType) { - Utils.nanSafeCompareFloats(input1.asInstanceOf[Float], input2.asInstanceOf[Float]) == 0 - } else if (left.dataType == DoubleType) { - Utils.nanSafeCompareDoubles(input1.asInstanceOf[Double], input2.asInstanceOf[Double]) == 0 - } else if (left.dataType != BinaryType) { - input1 == input2 - } else { - java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]]) - } - } + protected override def nullSafeEval(left: Any, right: Any): Any = ordering.equiv(left, right) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { defineCodeGen(ctx, ev, (c1, c2) => ctx.genEqual(left.dataType, c1, c2)) @@ -482,15 +474,7 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp } else if (input1 == null || input2 == null) { false } else { - if (left.dataType == FloatType) { - Utils.nanSafeCompareFloats(input1.asInstanceOf[Float], input2.asInstanceOf[Float]) == 0 - } else if (left.dataType == DoubleType) { - Utils.nanSafeCompareDoubles(input1.asInstanceOf[Double], input2.asInstanceOf[Double]) == 0 - } else if (left.dataType != BinaryType) { - input1 == input2 - } else { - java.util.Arrays.equals(input1.asInstanceOf[Array[Byte]], input2.asInstanceOf[Array[Byte]]) - } + ordering.equiv(input1, input2) } } @@ -513,8 +497,6 @@ case class LessThan(left: Expression, right: Expression) override def symbol: String = "<" - private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) - protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lt(input1, input2) } @@ -527,8 +509,6 @@ case class LessThanOrEqual(left: Expression, right: Expression) override def symbol: String = "<=" - private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) - protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.lteq(input1, input2) } @@ -541,8 +521,6 @@ case class GreaterThan(left: Expression, right: Expression) override def symbol: String = ">" - private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) - protected override def nullSafeEval(input1: Any, 
input2: Any): Any = ordering.gt(input1, input2) } @@ -555,7 +533,5 @@ case class GreaterThanOrEqual(left: Expression, right: Expression) override def symbol: String = ">=" - private lazy val ordering = TypeUtils.getInterpretedOrdering(left.dataType) - protected override def nullSafeEval(input1: Any, input2: Any): Any = ordering.gteq(input1, input2) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 2a445b8cdb091..f9f6799e6e72f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -21,6 +21,8 @@ import scala.collection.immutable.HashSet import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types._ @@ -293,4 +295,31 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(EqualNullSafe(nullInt, normalInt), false) checkEvaluation(EqualNullSafe(nullInt, nullInt), true) } + + test("EqualTo on complex type") { + val array = new GenericArrayData(Array(1, 2, 3)) + val struct = create_row("a", 1L, array) + + val arrayType = ArrayType(IntegerType) + val structType = new StructType() + .add("1", StringType) + .add("2", LongType) + .add("3", ArrayType(IntegerType)) + + val projection = UnsafeProjection.create( + new StructType().add("array", arrayType).add("struct", structType)) + + val unsafeRow = projection(InternalRow(array, struct)) + + val unsafeArray = unsafeRow.getArray(0) + val unsafeStruct = unsafeRow.getStruct(1, 3) + + checkEvaluation(EqualTo( + Literal.create(array, arrayType), + Literal.create(unsafeArray, arrayType)), true) + + checkEvaluation(EqualTo( + Literal.create(struct, structType), + Literal.create(unsafeStruct, structType)), true) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 6b517bc70f7d2..806381008aba6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2476,4 +2476,11 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-18053: ARRAY equality is broken") { + withTable("array_tbl") { + spark.range(10).select(array($"id").as("arr")).write.saveAsTable("array_tbl") + assert(sql("SELECT * FROM array_tbl where arr = ARRAY(1L)").count == 1) + } + } } From 539c193af7e3e08e9b48df15e94eafcc3532105c Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 23 Nov 2016 20:14:08 +0800 Subject: [PATCH 181/534] [SPARK-18545][SQL] Verify number of hive client RPCs in PartitionedTablePerfStatsSuite ## What changes were proposed in this pull request? This would help catch accidental O(n) calls to the hive client as in https://issues.apache.org/jira/browse/SPARK-18507 ## How was this patch tested? Checked that the test fails before https://issues.apache.org/jira/browse/SPARK-18507 was patched. cc cloud-fan Author: Eric Liang Closes #15985 from ericl/spark-18545. 
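The assertion pattern the new tests rely on, extracted as a standalone sketch (`HiveCatalogMetrics` lives in `org.apache.spark.metrics.source`; the bound of 10 mirrors the tests in the diff below and is not a hard API guarantee):

```scala
// Reset the shared counters, run the query under test, then bound the number of
// metastore RPCs it was allowed to make.
HiveCatalogMetrics.reset()
assert(spark.sql("select * from test where partCol1 = 1").count() == 1)
assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
```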
(cherry picked from commit 85235ed6c600270e3fa434738bd50dce3564440a) Signed-off-by: Wenchen Fan --- .../spark/metrics/source/StaticSources.scala | 7 +++ .../sql/hive/client/HiveClientImpl.scala | 1 + .../hive/PartitionedTablePerfStatsSuite.scala | 58 ++++++++++++++++++- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index 3f7cfd9d2c11f..b433cd0a89ac9 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -85,6 +85,11 @@ object HiveCatalogMetrics extends Source { */ val METRIC_FILE_CACHE_HITS = metricRegistry.counter(MetricRegistry.name("fileCacheHits")) + /** + * Tracks the total number of Hive client calls (e.g. to lookup a table). + */ + val METRIC_HIVE_CLIENT_CALLS = metricRegistry.counter(MetricRegistry.name("hiveClientCalls")) + /** * Resets the values of all metrics to zero. This is useful in tests. */ @@ -92,10 +97,12 @@ object HiveCatalogMetrics extends Source { METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount()) METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount()) METRIC_FILE_CACHE_HITS.dec(METRIC_FILE_CACHE_HITS.getCount()) + METRIC_HIVE_CLIENT_CALLS.dec(METRIC_HIVE_CLIENT_CALLS.getCount()) } // clients can use these to avoid classloader issues with the codahale classes def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n) def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n) def incrementFileCacheHits(n: Int): Unit = METRIC_FILE_CACHE_HITS.inc(n) + def incrementHiveClientCalls(n: Int): Unit = METRIC_HIVE_CLIENT_CALLS.inc(n) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index daae8523c6366..68dcfd86731bd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -281,6 +281,7 @@ private[hive] class HiveClientImpl( shim.setCurrentSessionState(state) val ret = try f finally { Thread.currentThread().setContextClassLoader(original) + HiveCatalogMetrics.incrementHiveClientCalls(1) } ret } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala index b41bc862e9bc5..9838b9a4eba3d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala @@ -57,7 +57,11 @@ class PartitionedTablePerfStatsSuite } private def setupPartitionedHiveTable(tableName: String, dir: File): Unit = { - spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write + setupPartitionedHiveTable(tableName, dir, 5) + } + + private def setupPartitionedHiveTable(tableName: String, dir: File, scale: Int): Unit = { + spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write .partitionBy("partCol1", "partCol2") .mode("overwrite") .parquet(dir.getAbsolutePath) @@ -71,7 +75,11 @@ class PartitionedTablePerfStatsSuite } private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = { - 
spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write + setupPartitionedDatasourceTable(tableName, dir, 5) + } + + private def setupPartitionedDatasourceTable(tableName: String, dir: File, scale: Int): Unit = { + spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write .partitionBy("partCol1", "partCol2") .mode("overwrite") .parquet(dir.getAbsolutePath) @@ -242,6 +250,52 @@ class PartitionedTablePerfStatsSuite } } + test("hive table: num hive client calls does not scale with partition count") { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + withTable("test") { + withTempDir { dir => + setupPartitionedHiveTable("test", dir, scale = 100) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 = 1").count() == 1) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() > 0) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 100) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + + HiveCatalogMetrics.reset() + assert(spark.sql("show partitions test").count() == 100) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + } + } + } + } + + test("datasource table: num hive client calls does not scale with partition count") { + withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") { + withTable("test") { + withTempDir { dir => + setupPartitionedDatasourceTable("test", dir, scale = 100) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test where partCol1 = 1").count() == 1) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() > 0) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + + HiveCatalogMetrics.reset() + assert(spark.sql("select * from test").count() == 100) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + + HiveCatalogMetrics.reset() + assert(spark.sql("show partitions test").count() == 100) + assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10) + } + } + } + } + test("hive table: files read and cached when filesource partition management is off") { withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") { withTable("test") { From e11d7c6874debfbbe44be4a2b0983d6b6763fff8 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 23 Nov 2016 04:22:26 -0800 Subject: [PATCH 182/534] [SPARK-18557] Downgrade confusing memory leak warning message ## What changes were proposed in this pull request? TaskMemoryManager has a memory leak detector that gets called at task completion callback and checks whether any memory has not been released. If they are not released by the time the callback is invoked, TaskMemoryManager releases them. The current error message says something like the following: ``` WARN [Executor task launch worker-0] org.apache.spark.memory.TaskMemoryManager - leak 16.3 MB memory from org.apache.spark.unsafe.map.BytesToBytesMap33fb6a15 In practice, there are multiple reasons why these can be triggered in the normal code path (e.g. limit, or task failures), and the fact that these messages are log means the "leak" is fixed by TaskMemoryManager. ``` To not confuse users, this patch downgrade the message from warning to debug level, and avoids using the word "leak" since it is not actually a leak. ## How was this patch tested? N/A - this is a simple logging improvement. 
Author: Reynold Xin

Closes #15989 from rxin/SPARK-18557.

(cherry picked from commit 9785ed40d7fe4e1fcd440e55706519c6e5f8d6b1)
Signed-off-by: Herman van Hovell

--- .../main/java/org/apache/spark/memory/TaskMemoryManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index 1a700aa37554e..c40974b54cb47 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -378,14 +378,14 @@ public long cleanUpAllAllocatedMemory() { for (MemoryConsumer c: consumers) { if (c != null && c.getUsed() > 0) { // In case of failed task, it's normal to see leaked memory - logger.warn("leak " + Utils.bytesToString(c.getUsed()) + " memory from " + c); + logger.debug("unreleased " + Utils.bytesToString(c.getUsed()) + " memory from " + c); } } consumers.clear(); for (MemoryBlock page : pageTable) { if (page != null) { - logger.warn("leak a page: " + page + " in task " + taskAttemptId); + logger.debug("unreleased page: " + page + " in task " + taskAttemptId); memoryManager.tungstenMemoryAllocator().free(page); } }

From 599dac1594ed52934dd483e12d2e39d514793dd9 Mon Sep 17 00:00:00 2001
From: Reynold Xin
Date: Wed, 23 Nov 2016 20:48:41 +0800
Subject: [PATCH 183/534] [SPARK-18522][SQL] Explicit contract for column stats serialization

## What changes were proposed in this pull request?

The current implementation of column stats uses the base64 encoding of the internal UnsafeRow format to persist statistics (in table properties in the Hive metastore). This is an internal format that is not stable across different versions of Spark and should NOT be used for persistence. In addition, it would be better if the statistics stored in the catalog were human readable.

This pull request introduces the following changes:

1. Created a single ColumnStat class for all data types. All data types track the same set of statistics.
2. Updated the implementation for stats collection to get rid of the dependency on internal data structures (e.g. InternalRow, or storing DateType as an int32). For example, previously dates were stored as a single integer, but are now stored as java.sql.Date. When we implement the next steps of CBO, we can add code to convert those back into internal types again.
3. Documented clearly what JVM data types are being used to store what data.
4. Defined a simple Map[String, String] interface for serializing and deserializing column stats into/from the catalog.
5. Rearranged the method/function structure so it is clearer what the supported data types are, and also moved how stats are generated into the ColumnStat class so they are easy to find.

## How was this patch tested?

Removed most of the original test cases created for column statistics, and added three very simple ones to cover all the cases. The three test cases validate:

1. Roundtrip serialization works.
2. Behavior when analyzing a non-existent column or a column of an unsupported data type.
3. Results of stats collection for all valid data types.

Also moved parser related tests into a parser test suite and added an explicit serialization test for the Hive external catalog.

Author: Reynold Xin

Closes #15959 from rxin/SPARK-18522.
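To make the new serialization contract concrete, here is a small round-trip sketch (illustrative only, not part of the patch; ColumnStat, toMap and fromMap are the names introduced in the diff below, while the table and column names are made up):

```
// Round-tripping a ColumnStat for an INT column through the Map[String, String]
// form that ends up in the metastore table properties.
import org.apache.spark.sql.catalyst.plans.logical.ColumnStat
import org.apache.spark.sql.types.{IntegerType, StructField}

// Integral types are upcast to Long for min/max, per the contract documented on ColumnStat.
val stat = ColumnStat(distinctCount = 2, min = Some(1L), max = Some(4L),
  nullCount = 1, avgLen = 4, maxLen = 4)

// Serialized form, roughly: Map("version" -> "1", "distinctCount" -> "2", "min" -> "1",
// "max" -> "4", "nullCount" -> "1", "avgLen" -> "4", "maxLen" -> "4")
val serialized: Map[String, String] = stat.toMap

// Deserialization needs the column's StructField so min/max can be parsed back by data type.
val restored = ColumnStat.fromMap("some_table", StructField("cint", IntegerType), serialized)
assert(restored == Some(stat))
```

In the catalog itself each key is additionally prefixed per column, e.g. spark.sql.statistics.colStats.cint.min (see columnStatKeyPropName in the HiveExternalCatalog change further down).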
(cherry picked from commit 70ad07a9d20586ae182c4e60ed97bdddbcbceff3) Signed-off-by: Wenchen Fan --- .../catalyst/plans/logical/Statistics.scala | 212 ++++++++--- .../command/AnalyzeColumnCommand.scala | 105 +----- .../spark/sql/StatisticsCollectionSuite.scala | 218 ++++++++++++ .../spark/sql/StatisticsColumnSuite.scala | 334 ------------------ .../apache/spark/sql/StatisticsSuite.scala | 92 ----- .../org/apache/spark/sql/StatisticsTest.scala | 130 ------- .../sql/execution/SparkSqlParserSuite.scala | 26 +- .../spark/sql/hive/HiveExternalCatalog.scala | 93 +++-- .../spark/sql/hive/StatisticsSuite.scala | 299 ++++++---------- 9 files changed, 591 insertions(+), 918 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala index f3e2147b8f974..79865609cb647 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala @@ -17,12 +17,15 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.commons.codec.binary.Base64 +import scala.util.control.NonFatal -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.types._ + /** * Estimates of various statistics. The default estimation logic simply lazily multiplies the * corresponding statistic produced by the children. To override this behavior, override @@ -58,60 +61,175 @@ case class Statistics( } } + /** - * Statistics for a column. + * Statistics collected for a column. + * + * 1. Supported data types are defined in `ColumnStat.supportsType`. + * 2. The JVM data type stored in min/max is the external data type (used in Row) for the + * corresponding Catalyst data type. For example, for DateType we store java.sql.Date, and for + * TimestampType we store java.sql.Timestamp. + * 3. For integral types, they are all upcasted to longs, i.e. shorts are stored as longs. + * 4. There is no guarantee that the statistics collected are accurate. Approximation algorithms + * (sketches) might have been used, and the data collected can also be stale. + * + * @param distinctCount number of distinct values + * @param min minimum value + * @param max maximum value + * @param nullCount number of nulls + * @param avgLen average length of the values. For fixed-length types, this should be a constant. + * @param maxLen maximum length of the values. For fixed-length types, this should be a constant. 
*/ -case class ColumnStat(statRow: InternalRow) { +case class ColumnStat( + distinctCount: BigInt, + min: Option[Any], + max: Option[Any], + nullCount: BigInt, + avgLen: Long, + maxLen: Long) { - def forNumeric[T <: AtomicType](dataType: T): NumericColumnStat[T] = { - NumericColumnStat(statRow, dataType) - } - def forString: StringColumnStat = StringColumnStat(statRow) - def forBinary: BinaryColumnStat = BinaryColumnStat(statRow) - def forBoolean: BooleanColumnStat = BooleanColumnStat(statRow) + // We currently don't store min/max for binary/string type. This can change in the future and + // then we need to remove this require. + require(min.isEmpty || (!min.get.isInstanceOf[Array[Byte]] && !min.get.isInstanceOf[String])) + require(max.isEmpty || (!max.get.isInstanceOf[Array[Byte]] && !max.get.isInstanceOf[String])) - override def toString: String = { - // use Base64 for encoding - Base64.encodeBase64String(statRow.asInstanceOf[UnsafeRow].getBytes) + /** + * Returns a map from string to string that can be used to serialize the column stats. + * The key is the name of the field (e.g. "distinctCount" or "min"), and the value is the string + * representation for the value. The deserialization side is defined in [[ColumnStat.fromMap]]. + * + * As part of the protocol, the returned map always contains a key called "version". + * In the case min/max values are null (None), they won't appear in the map. + */ + def toMap: Map[String, String] = { + val map = new scala.collection.mutable.HashMap[String, String] + map.put(ColumnStat.KEY_VERSION, "1") + map.put(ColumnStat.KEY_DISTINCT_COUNT, distinctCount.toString) + map.put(ColumnStat.KEY_NULL_COUNT, nullCount.toString) + map.put(ColumnStat.KEY_AVG_LEN, avgLen.toString) + map.put(ColumnStat.KEY_MAX_LEN, maxLen.toString) + min.foreach { v => map.put(ColumnStat.KEY_MIN_VALUE, v.toString) } + max.foreach { v => map.put(ColumnStat.KEY_MAX_VALUE, v.toString) } + map.toMap } } -object ColumnStat { - def apply(numFields: Int, str: String): ColumnStat = { - // use Base64 for decoding - val bytes = Base64.decodeBase64(str) - val unsafeRow = new UnsafeRow(numFields) - unsafeRow.pointTo(bytes, bytes.length) - ColumnStat(unsafeRow) + +object ColumnStat extends Logging { + + // List of string keys used to serialize ColumnStat + val KEY_VERSION = "version" + private val KEY_DISTINCT_COUNT = "distinctCount" + private val KEY_MIN_VALUE = "min" + private val KEY_MAX_VALUE = "max" + private val KEY_NULL_COUNT = "nullCount" + private val KEY_AVG_LEN = "avgLen" + private val KEY_MAX_LEN = "maxLen" + + /** Returns true iff the we support gathering column statistics on column of the given type. */ + def supportsType(dataType: DataType): Boolean = dataType match { + case _: IntegralType => true + case _: DecimalType => true + case DoubleType | FloatType => true + case BooleanType => true + case DateType => true + case TimestampType => true + case BinaryType | StringType => true + case _ => false } -} -case class NumericColumnStat[T <: AtomicType](statRow: InternalRow, dataType: T) { - // The indices here must be consistent with `ColumnStatStruct.numericColumnStat`. - val numNulls: Long = statRow.getLong(0) - val max: T#InternalType = statRow.get(1, dataType).asInstanceOf[T#InternalType] - val min: T#InternalType = statRow.get(2, dataType).asInstanceOf[T#InternalType] - val ndv: Long = statRow.getLong(3) -} + /** + * Creates a [[ColumnStat]] object from the given map. This is used to deserialize column stats + * from some external storage. 
The serialization side is defined in [[ColumnStat.toMap]]. + */ + def fromMap(table: String, field: StructField, map: Map[String, String]) + : Option[ColumnStat] = { + val str2val: (String => Any) = field.dataType match { + case _: IntegralType => _.toLong + case _: DecimalType => new java.math.BigDecimal(_) + case DoubleType | FloatType => _.toDouble + case BooleanType => _.toBoolean + case DateType => java.sql.Date.valueOf + case TimestampType => java.sql.Timestamp.valueOf + // This version of Spark does not use min/max for binary/string types so we ignore it. + case BinaryType | StringType => _ => null + case _ => + throw new AnalysisException("Column statistics deserialization is not supported for " + + s"column ${field.name} of data type: ${field.dataType}.") + } -case class StringColumnStat(statRow: InternalRow) { - // The indices here must be consistent with `ColumnStatStruct.stringColumnStat`. - val numNulls: Long = statRow.getLong(0) - val avgColLen: Double = statRow.getDouble(1) - val maxColLen: Long = statRow.getInt(2) - val ndv: Long = statRow.getLong(3) -} + try { + Some(ColumnStat( + distinctCount = BigInt(map(KEY_DISTINCT_COUNT).toLong), + // Note that flatMap(Option.apply) turns Option(null) into None. + min = map.get(KEY_MIN_VALUE).map(str2val).flatMap(Option.apply), + max = map.get(KEY_MAX_VALUE).map(str2val).flatMap(Option.apply), + nullCount = BigInt(map(KEY_NULL_COUNT).toLong), + avgLen = map.getOrElse(KEY_AVG_LEN, field.dataType.defaultSize.toString).toLong, + maxLen = map.getOrElse(KEY_MAX_LEN, field.dataType.defaultSize.toString).toLong + )) + } catch { + case NonFatal(e) => + logWarning(s"Failed to parse column statistics for column ${field.name} in table $table", e) + None + } + } -case class BinaryColumnStat(statRow: InternalRow) { - // The indices here must be consistent with `ColumnStatStruct.binaryColumnStat`. - val numNulls: Long = statRow.getLong(0) - val avgColLen: Double = statRow.getDouble(1) - val maxColLen: Long = statRow.getInt(2) -} + /** + * Constructs an expression to compute column statistics for a given column. + * + * The expression should create a single struct column with the following schema: + * distinctCount: Long, min: T, max: T, nullCount: Long, avgLen: Long, maxLen: Long + * + * Together with [[rowToColumnStat]], this function is used to create [[ColumnStat]] and + * as a result should stay in sync with it. + */ + def statExprs(col: Attribute, relativeSD: Double): CreateNamedStruct = { + def struct(exprs: Expression*): CreateNamedStruct = CreateStruct(exprs.map { expr => + expr.transformUp { case af: AggregateFunction => af.toAggregateExpression() } + }) + val one = Literal(1, LongType) + + // the approximate ndv (num distinct value) should never be larger than the number of rows + val numNonNulls = if (col.nullable) Count(col) else Count(one) + val ndv = Least(Seq(HyperLogLogPlusPlus(col, relativeSD), numNonNulls)) + val numNulls = Subtract(Count(one), numNonNulls) + + def fixedLenTypeStruct(castType: DataType) = { + // For fixed width types, avg size should be the same as max size. 
+ val avgSize = Literal(col.dataType.defaultSize, LongType) + struct(ndv, Cast(Min(col), castType), Cast(Max(col), castType), numNulls, avgSize, avgSize) + } + + col.dataType match { + case _: IntegralType => fixedLenTypeStruct(LongType) + case _: DecimalType => fixedLenTypeStruct(col.dataType) + case DoubleType | FloatType => fixedLenTypeStruct(DoubleType) + case BooleanType => fixedLenTypeStruct(col.dataType) + case DateType => fixedLenTypeStruct(col.dataType) + case TimestampType => fixedLenTypeStruct(col.dataType) + case BinaryType | StringType => + // For string and binary type, we don't store min/max. + val nullLit = Literal(null, col.dataType) + struct( + ndv, nullLit, nullLit, numNulls, + Ceil(Average(Length(col))), Cast(Max(Length(col)), LongType)) + case _ => + throw new AnalysisException("Analyzing column statistics is not supported for column " + + s"${col.name} of data type: ${col.dataType}.") + } + } + + /** Convert a struct for column stats (defined in statExprs) into [[ColumnStat]]. */ + def rowToColumnStat(row: Row): ColumnStat = { + ColumnStat( + distinctCount = BigInt(row.getLong(0)), + min = Option(row.get(1)), // for string/binary min/max, get should return null + max = Option(row.get(2)), + nullCount = BigInt(row.getLong(3)), + avgLen = row.getLong(4), + maxLen = row.getLong(5) + ) + } -case class BooleanColumnStat(statRow: InternalRow) { - // The indices here must be consistent with `ColumnStatStruct.booleanColumnStat`. - val numNulls: Long = statRow.getLong(0) - val numTrues: Long = statRow.getLong(1) - val numFalses: Long = statRow.getLong(2) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 7fc57d09e9243..9dffe3614a87c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -24,9 +24,8 @@ import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogTable} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, ColumnStat, LogicalPlan, Statistics} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.types._ /** @@ -62,7 +61,7 @@ case class AnalyzeColumnCommand( // Compute stats for each column val (rowCount, newColStats) = - AnalyzeColumnCommand.computeColStats(sparkSession, relation, columnNames) + AnalyzeColumnCommand.computeColumnStats(sparkSession, tableIdent.table, relation, columnNames) // We also update table-level stats in order to keep them consistent with column-level stats. val statistics = Statistics( @@ -88,8 +87,9 @@ object AnalyzeColumnCommand extends Logging { * * This is visible for testing. 
*/ - def computeColStats( + def computeColumnStats( sparkSession: SparkSession, + tableName: String, relation: LogicalPlan, columnNames: Seq[String]): (Long, Map[String, ColumnStat]) = { @@ -97,102 +97,33 @@ object AnalyzeColumnCommand extends Logging { val resolver = sparkSession.sessionState.conf.resolver val attributesToAnalyze = AttributeSet(columnNames.map { col => val exprOption = relation.output.find(attr => resolver(attr.name, col)) - exprOption.getOrElse(throw new AnalysisException(s"Invalid column name: $col.")) + exprOption.getOrElse(throw new AnalysisException(s"Column $col does not exist.")) }).toSeq + // Make sure the column types are supported for stats gathering. + attributesToAnalyze.foreach { attr => + if (!ColumnStat.supportsType(attr.dataType)) { + throw new AnalysisException( + s"Column ${attr.name} in table $tableName is of type ${attr.dataType}, " + + "and Spark does not support statistics collection on this column type.") + } + } + // Collect statistics per column. // The first element in the result will be the overall row count, the following elements // will be structs containing all column stats. // The layout of each struct follows the layout of the ColumnStats. val ndvMaxErr = sparkSession.sessionState.conf.ndvMaxError val expressions = Count(Literal(1)).toAggregateExpression() +: - attributesToAnalyze.map(AnalyzeColumnCommand.createColumnStatStruct(_, ndvMaxErr)) + attributesToAnalyze.map(ColumnStat.statExprs(_, ndvMaxErr)) + val namedExpressions = expressions.map(e => Alias(e, e.toString)()) - val statsRow = Dataset.ofRows(sparkSession, Aggregate(Nil, namedExpressions, relation)) - .queryExecution.toRdd.collect().head + val statsRow = Dataset.ofRows(sparkSession, Aggregate(Nil, namedExpressions, relation)).head() - // unwrap the result - // TODO: Get rid of numFields by using the public Dataset API. val rowCount = statsRow.getLong(0) val columnStats = attributesToAnalyze.zipWithIndex.map { case (expr, i) => - val numFields = AnalyzeColumnCommand.numStatFields(expr.dataType) - (expr.name, ColumnStat(statsRow.getStruct(i + 1, numFields))) + (expr.name, ColumnStat.rowToColumnStat(statsRow.getStruct(i + 1))) }.toMap (rowCount, columnStats) } - - private val zero = Literal(0, LongType) - private val one = Literal(1, LongType) - - private def numNulls(e: Expression): Expression = { - if (e.nullable) Sum(If(IsNull(e), one, zero)) else zero - } - private def max(e: Expression): Expression = Max(e) - private def min(e: Expression): Expression = Min(e) - private def ndv(e: Expression, relativeSD: Double): Expression = { - // the approximate ndv should never be larger than the number of rows - Least(Seq(HyperLogLogPlusPlus(e, relativeSD), Count(one))) - } - private def avgLength(e: Expression): Expression = Average(Length(e)) - private def maxLength(e: Expression): Expression = Max(Length(e)) - private def numTrues(e: Expression): Expression = Sum(If(e, one, zero)) - private def numFalses(e: Expression): Expression = Sum(If(Not(e), one, zero)) - - /** - * Creates a struct that groups the sequence of expressions together. This is used to create - * one top level struct per column. 
- */ - private def createStruct(exprs: Seq[Expression]): CreateNamedStruct = { - CreateStruct(exprs.map { expr: Expression => - expr.transformUp { - case af: AggregateFunction => af.toAggregateExpression() - } - }) - } - - private def numericColumnStat(e: Expression, relativeSD: Double): Seq[Expression] = { - Seq(numNulls(e), max(e), min(e), ndv(e, relativeSD)) - } - - private def stringColumnStat(e: Expression, relativeSD: Double): Seq[Expression] = { - Seq(numNulls(e), avgLength(e), maxLength(e), ndv(e, relativeSD)) - } - - private def binaryColumnStat(e: Expression): Seq[Expression] = { - Seq(numNulls(e), avgLength(e), maxLength(e)) - } - - private def booleanColumnStat(e: Expression): Seq[Expression] = { - Seq(numNulls(e), numTrues(e), numFalses(e)) - } - - // TODO(rxin): Get rid of this function. - def numStatFields(dataType: DataType): Int = { - dataType match { - case BinaryType | BooleanType => 3 - case _ => 4 - } - } - - /** - * Creates a struct expression that contains the statistics to collect for a column. - * - * @param attr column to collect statistics - * @param relativeSD relative error for approximate number of distinct values. - */ - def createColumnStatStruct(attr: Attribute, relativeSD: Double): CreateNamedStruct = { - attr.dataType match { - case _: NumericType | TimestampType | DateType => - createStruct(numericColumnStat(attr, relativeSD)) - case StringType => - createStruct(stringColumnStat(attr, relativeSD)) - case BinaryType => - createStruct(binaryColumnStat(attr)) - case BooleanType => - createStruct(booleanColumnStat(attr)) - case otherType => - throw new AnalysisException("Analyzing columns is not supported for column " + - s"${attr.name} of data type: ${attr.dataType}.") - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala new file mode 100644 index 0000000000000..1fcccd061079e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.{lang => jl} +import java.sql.{Date, Timestamp} + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} +import org.apache.spark.sql.test.SQLTestData.ArrayData +import org.apache.spark.sql.types._ + + +/** + * End-to-end suite testing statistics collection and use on both entire table and columns. 
+ */ +class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSQLContext { + import testImplicits._ + + private def checkTableStats(tableName: String, expectedRowCount: Option[Int]) + : Option[Statistics] = { + val df = spark.table(tableName) + val stats = df.queryExecution.analyzed.collect { case rel: LogicalRelation => + assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount) + rel.catalogTable.get.stats + } + assert(stats.size == 1) + stats.head + } + + test("estimates the size of a limit 0 on outer join") { + withTempView("test") { + Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") + .createOrReplaceTempView("test") + val df1 = spark.table("test") + val df2 = spark.table("test").limit(0) + val df = df1.join(df2, Seq("k"), "left") + + val sizes = df.queryExecution.analyzed.collect { case g: Join => + g.statistics.sizeInBytes + } + + assert(sizes.size === 1, s"number of Join nodes is wrong:\n ${df.queryExecution}") + assert(sizes.head === BigInt(96), + s"expected exact size 96 for table 'test', got: ${sizes.head}") + } + } + + test("analyze column command - unsupported types and invalid columns") { + val tableName = "column_stats_test1" + withTable(tableName) { + Seq(ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3)))).toDF().write.saveAsTable(tableName) + + // Test unsupported data types + val err1 = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS data") + } + assert(err1.message.contains("does not support statistics collection")) + + // Test invalid columns + val err2 = intercept[AnalysisException] { + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS some_random_column") + } + assert(err2.message.contains("does not exist")) + } + } + + test("test table-level statistics for data source table") { + val tableName = "tbl" + withTable(tableName) { + sql(s"CREATE TABLE $tableName(i INT, j STRING) USING parquet") + Seq(1 -> "a", 2 -> "b").toDF("i", "j").write.mode("overwrite").insertInto(tableName) + + // noscan won't count the number of rows + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") + checkTableStats(tableName, expectedRowCount = None) + + // without noscan, we count the number of rows + sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS") + checkTableStats(tableName, expectedRowCount = Some(2)) + } + } + + test("SPARK-15392: DataFrame created from RDD should not be broadcasted") { + val rdd = sparkContext.range(1, 100).map(i => Row(i, i)) + val df = spark.createDataFrame(rdd, new StructType().add("a", LongType).add("b", LongType)) + assert(df.queryExecution.analyzed.statistics.sizeInBytes > + spark.sessionState.conf.autoBroadcastJoinThreshold) + assert(df.selectExpr("a").queryExecution.analyzed.statistics.sizeInBytes > + spark.sessionState.conf.autoBroadcastJoinThreshold) + } + + test("estimates the size of limit") { + withTempView("test") { + Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") + .createOrReplaceTempView("test") + Seq((0, 1), (1, 24), (2, 48)).foreach { case (limit, expected) => + val df = sql(s"""SELECT * FROM test limit $limit""") + + val sizesGlobalLimit = df.queryExecution.analyzed.collect { case g: GlobalLimit => + g.statistics.sizeInBytes + } + assert(sizesGlobalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") + assert(sizesGlobalLimit.head === BigInt(expected), + s"expected exact size $expected for table 'test', got: ${sizesGlobalLimit.head}") + + val sizesLocalLimit = df.queryExecution.analyzed.collect { 
case l: LocalLimit => + l.statistics.sizeInBytes + } + assert(sizesLocalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") + assert(sizesLocalLimit.head === BigInt(expected), + s"expected exact size $expected for table 'test', got: ${sizesLocalLimit.head}") + } + } + } + +} + + +/** + * The base for test cases that we want to include in both the hive module (for verifying behavior + * when using the Hive external catalog) as well as in the sql/core module. + */ +abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils { + import testImplicits._ + + private val dec1 = new java.math.BigDecimal("1.000000000000000000") + private val dec2 = new java.math.BigDecimal("8.000000000000000000") + private val d1 = Date.valueOf("2016-05-08") + private val d2 = Date.valueOf("2016-05-09") + private val t1 = Timestamp.valueOf("2016-05-08 00:00:01") + private val t2 = Timestamp.valueOf("2016-05-09 00:00:02") + + /** + * Define a very simple 3 row table used for testing column serialization. + * Note: last column is seq[int] which doesn't support stats collection. + */ + protected val data = Seq[ + (jl.Boolean, jl.Byte, jl.Short, jl.Integer, jl.Long, + jl.Double, jl.Float, java.math.BigDecimal, + String, Array[Byte], Date, Timestamp, + Seq[Int])]( + (false, 1.toByte, 1.toShort, 1, 1L, 1.0, 1.0f, dec1, "s1", "b1".getBytes, d1, t1, null), + (true, 2.toByte, 3.toShort, 4, 5L, 6.0, 7.0f, dec2, "ss9", "bb0".getBytes, d2, t2, null), + (null, null, null, null, null, null, null, null, null, null, null, null, null) + ) + + /** A mapping from column to the stats collected. */ + protected val stats = mutable.LinkedHashMap( + "cbool" -> ColumnStat(2, Some(false), Some(true), 1, 1, 1), + "cbyte" -> ColumnStat(2, Some(1L), Some(2L), 1, 1, 1), + "cshort" -> ColumnStat(2, Some(1L), Some(3L), 1, 2, 2), + "cint" -> ColumnStat(2, Some(1L), Some(4L), 1, 4, 4), + "clong" -> ColumnStat(2, Some(1L), Some(5L), 1, 8, 8), + "cdouble" -> ColumnStat(2, Some(1.0), Some(6.0), 1, 8, 8), + "cfloat" -> ColumnStat(2, Some(1.0), Some(7.0), 1, 4, 4), + "cdecimal" -> ColumnStat(2, Some(dec1), Some(dec2), 1, 16, 16), + "cstring" -> ColumnStat(2, None, None, 1, 3, 3), + "cbinary" -> ColumnStat(2, None, None, 1, 3, 3), + "cdate" -> ColumnStat(2, Some(d1), Some(d2), 1, 4, 4), + "ctimestamp" -> ColumnStat(2, Some(t1), Some(t2), 1, 8, 8) + ) + + test("column stats round trip serialization") { + // Make sure we serialize and then deserialize and we will get the result data + val df = data.toDF(stats.keys.toSeq :+ "carray" : _*) + stats.zip(df.schema).foreach { case ((k, v), field) => + withClue(s"column $k with type ${field.dataType}") { + val roundtrip = ColumnStat.fromMap("table_is_foo", field, v.toMap) + assert(roundtrip == Some(v)) + } + } + } + + test("analyze column command - result verification") { + val tableName = "column_stats_test2" + // (data.head.productArity - 1) because the last column does not support stats collection. 
+ assert(stats.size == data.head.productArity - 1) + val df = data.toDF(stats.keys.toSeq :+ "carray" : _*) + + withTable(tableName) { + df.write.saveAsTable(tableName) + + // Collect statistics + sql(s"analyze table $tableName compute STATISTICS FOR COLUMNS " + stats.keys.mkString(", ")) + + // Validate statistics + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) + assert(table.stats.isDefined) + assert(table.stats.get.colStats.size == stats.size) + + stats.foreach { case (k, v) => + withClue(s"column $k") { + assert(table.stats.get.colStats(k) == v) + } + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala deleted file mode 100644 index e866ac2cb3b34..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsColumnSuite.scala +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import java.sql.{Date, Timestamp} - -import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.catalyst.plans.logical.ColumnStat -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.command.AnalyzeColumnCommand -import org.apache.spark.sql.test.SQLTestData.ArrayData -import org.apache.spark.sql.types._ - -class StatisticsColumnSuite extends StatisticsTest { - import testImplicits._ - - test("parse analyze column commands") { - val tableName = "tbl" - - // we need to specify column names - intercept[ParseException] { - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS") - } - - val analyzeSql = s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS key, value" - val parsed = spark.sessionState.sqlParser.parsePlan(analyzeSql) - val expected = AnalyzeColumnCommand(TableIdentifier(tableName), Seq("key", "value")) - comparePlans(parsed, expected) - } - - test("analyzing columns of non-atomic types is not supported") { - val tableName = "tbl" - withTable(tableName) { - Seq(ArrayData(Seq(1, 2, 3), Seq(Seq(1, 2, 3)))).toDF().write.saveAsTable(tableName) - val err = intercept[AnalysisException] { - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS data") - } - assert(err.message.contains("Analyzing columns is not supported")) - } - } - - test("check correctness of columns") { - val table = "tbl" - val colName1 = "abc" - val colName2 = "x.yz" - withTable(table) { - sql(s"CREATE TABLE $table ($colName1 int, `$colName2` string) USING PARQUET") - - val invalidColError = intercept[AnalysisException] { - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS key") - } - 
assert(invalidColError.message == "Invalid column name: key.") - - withSQLConf("spark.sql.caseSensitive" -> "true") { - val invalidErr = intercept[AnalysisException] { - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS ${colName1.toUpperCase}") - } - assert(invalidErr.message == s"Invalid column name: ${colName1.toUpperCase}.") - } - - withSQLConf("spark.sql.caseSensitive" -> "false") { - val columnsToAnalyze = Seq(colName2.toUpperCase, colName1, colName2) - val tableIdent = TableIdentifier(table, Some("default")) - val relation = spark.sessionState.catalog.lookupRelation(tableIdent) - val (_, columnStats) = - AnalyzeColumnCommand.computeColStats(spark, relation, columnsToAnalyze) - assert(columnStats.contains(colName1)) - assert(columnStats.contains(colName2)) - // check deduplication - assert(columnStats.size == 2) - assert(!columnStats.contains(colName2.toUpperCase)) - } - } - } - - private def getNonNullValues[T](values: Seq[Option[T]]): Seq[T] = { - values.filter(_.isDefined).map(_.get) - } - - test("column-level statistics for integral type columns") { - val values = (0 to 5).map { i => - if (i % 2 == 0) None else Some(i) - } - val data = values.map { i => - (i.map(_.toByte), i.map(_.toShort), i.map(_.toInt), i.map(_.toLong)) - } - - val df = data.toDF("c1", "c2", "c3", "c4") - val nonNullValues = getNonNullValues[Int](values) - val expectedColStatsSeq = df.schema.map { f => - val colStat = ColumnStat(InternalRow( - values.count(_.isEmpty).toLong, - nonNullValues.max, - nonNullValues.min, - nonNullValues.distinct.length.toLong)) - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for fractional type columns") { - val values: Seq[Option[Decimal]] = (0 to 5).map { i => - if (i == 0) None else Some(Decimal(i + i * 0.01)) - } - val data = values.map { i => - (i.map(_.toFloat), i.map(_.toDouble), i) - } - - val df = data.toDF("c1", "c2", "c3") - val nonNullValues = getNonNullValues[Decimal](values) - val numNulls = values.count(_.isEmpty).toLong - val ndv = nonNullValues.distinct.length.toLong - val expectedColStatsSeq = df.schema.map { f => - val colStat = f.dataType match { - case floatType: FloatType => - ColumnStat(InternalRow(numNulls, nonNullValues.max.toFloat, nonNullValues.min.toFloat, - ndv)) - case doubleType: DoubleType => - ColumnStat(InternalRow(numNulls, nonNullValues.max.toDouble, nonNullValues.min.toDouble, - ndv)) - case decimalType: DecimalType => - ColumnStat(InternalRow(numNulls, nonNullValues.max, nonNullValues.min, ndv)) - } - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for string column") { - val values = Seq(None, Some("a"), Some("bbbb"), Some("cccc"), Some("")) - val df = values.toDF("c1") - val nonNullValues = getNonNullValues[String](values) - val expectedColStatsSeq = df.schema.map { f => - val colStat = ColumnStat(InternalRow( - values.count(_.isEmpty).toLong, - nonNullValues.map(_.length).sum / nonNullValues.length.toDouble, - nonNullValues.map(_.length).max.toInt, - nonNullValues.distinct.length.toLong)) - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for binary column") { - val values = Seq(None, Some("a"), Some("bbbb"), Some("cccc"), Some("")).map(_.map(_.getBytes)) - val df = values.toDF("c1") - val nonNullValues = getNonNullValues[Array[Byte]](values) - val expectedColStatsSeq = df.schema.map { f => - val colStat = ColumnStat(InternalRow( - values.count(_.isEmpty).toLong, - 
nonNullValues.map(_.length).sum / nonNullValues.length.toDouble, - nonNullValues.map(_.length).max.toInt)) - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for boolean column") { - val values = Seq(None, Some(true), Some(false), Some(true)) - val df = values.toDF("c1") - val nonNullValues = getNonNullValues[Boolean](values) - val expectedColStatsSeq = df.schema.map { f => - val colStat = ColumnStat(InternalRow( - values.count(_.isEmpty).toLong, - nonNullValues.count(_.equals(true)).toLong, - nonNullValues.count(_.equals(false)).toLong)) - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for date column") { - val values = Seq(None, Some("1970-01-01"), Some("1970-02-02")).map(_.map(Date.valueOf)) - val df = values.toDF("c1") - val nonNullValues = getNonNullValues[Date](values) - val expectedColStatsSeq = df.schema.map { f => - val colStat = ColumnStat(InternalRow( - values.count(_.isEmpty).toLong, - // Internally, DateType is represented as the number of days from 1970-01-01. - nonNullValues.map(DateTimeUtils.fromJavaDate).max, - nonNullValues.map(DateTimeUtils.fromJavaDate).min, - nonNullValues.distinct.length.toLong)) - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for timestamp column") { - val values = Seq(None, Some("1970-01-01 00:00:00"), Some("1970-01-01 00:00:05")).map { i => - i.map(Timestamp.valueOf) - } - val df = values.toDF("c1") - val nonNullValues = getNonNullValues[Timestamp](values) - val expectedColStatsSeq = df.schema.map { f => - val colStat = ColumnStat(InternalRow( - values.count(_.isEmpty).toLong, - // Internally, TimestampType is represented as the number of days from 1970-01-01 - nonNullValues.map(DateTimeUtils.fromJavaTimestamp).max, - nonNullValues.map(DateTimeUtils.fromJavaTimestamp).min, - nonNullValues.distinct.length.toLong)) - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for null columns") { - val values = Seq(None, None) - val data = values.map { i => - (i.map(_.toString), i.map(_.toString.toInt)) - } - val df = data.toDF("c1", "c2") - val expectedColStatsSeq = df.schema.map { f => - (f, ColumnStat(InternalRow(values.count(_.isEmpty).toLong, null, null, 0L))) - } - checkColStats(df, expectedColStatsSeq) - } - - test("column-level statistics for columns with different types") { - val intSeq = Seq(1, 2) - val doubleSeq = Seq(1.01d, 2.02d) - val stringSeq = Seq("a", "bb") - val binarySeq = Seq("a", "bb").map(_.getBytes) - val booleanSeq = Seq(true, false) - val dateSeq = Seq("1970-01-01", "1970-02-02").map(Date.valueOf) - val timestampSeq = Seq("1970-01-01 00:00:00", "1970-01-01 00:00:05").map(Timestamp.valueOf) - val longSeq = Seq(5L, 4L) - - val data = intSeq.indices.map { i => - (intSeq(i), doubleSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i), dateSeq(i), - timestampSeq(i), longSeq(i)) - } - val df = data.toDF("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8") - val expectedColStatsSeq = df.schema.map { f => - val colStat = f.dataType match { - case IntegerType => - ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong)) - case DoubleType => - ColumnStat(InternalRow(0L, doubleSeq.max, doubleSeq.min, - doubleSeq.distinct.length.toLong)) - case StringType => - ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble, - stringSeq.map(_.length).max.toInt, stringSeq.distinct.length.toLong)) - case BinaryType => - 
ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble, - binarySeq.map(_.length).max.toInt)) - case BooleanType => - ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong, - booleanSeq.count(_.equals(false)).toLong)) - case DateType => - ColumnStat(InternalRow(0L, dateSeq.map(DateTimeUtils.fromJavaDate).max, - dateSeq.map(DateTimeUtils.fromJavaDate).min, dateSeq.distinct.length.toLong)) - case TimestampType => - ColumnStat(InternalRow(0L, timestampSeq.map(DateTimeUtils.fromJavaTimestamp).max, - timestampSeq.map(DateTimeUtils.fromJavaTimestamp).min, - timestampSeq.distinct.length.toLong)) - case LongType => - ColumnStat(InternalRow(0L, longSeq.max, longSeq.min, longSeq.distinct.length.toLong)) - } - (f, colStat) - } - checkColStats(df, expectedColStatsSeq) - } - - test("update table-level stats while collecting column-level stats") { - val table = "tbl" - withTable(table) { - sql(s"CREATE TABLE $table (c1 int) USING PARQUET") - sql(s"INSERT INTO $table SELECT 1") - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS") - checkTableStats(tableName = table, expectedRowCount = Some(1)) - - // update table-level stats between analyze table and analyze column commands - sql(s"INSERT INTO $table SELECT 1") - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1") - val fetchedStats = checkTableStats(tableName = table, expectedRowCount = Some(2)) - - val colStat = fetchedStats.get.colStats("c1") - StatisticsTest.checkColStat( - dataType = IntegerType, - colStat = colStat, - expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)), - rsd = spark.sessionState.conf.ndvMaxError) - } - } - - test("analyze column stats independently") { - val table = "tbl" - withTable(table) { - sql(s"CREATE TABLE $table (c1 int, c2 long) USING PARQUET") - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c1") - val fetchedStats1 = checkTableStats(tableName = table, expectedRowCount = Some(0)) - assert(fetchedStats1.get.colStats.size == 1) - val expected1 = ColumnStat(InternalRow(0L, null, null, 0L)) - val rsd = spark.sessionState.conf.ndvMaxError - StatisticsTest.checkColStat( - dataType = IntegerType, - colStat = fetchedStats1.get.colStats("c1"), - expectedColStat = expected1, - rsd = rsd) - - sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS c2") - val fetchedStats2 = checkTableStats(tableName = table, expectedRowCount = Some(0)) - // column c1 is kept in the stats - assert(fetchedStats2.get.colStats.size == 2) - StatisticsTest.checkColStat( - dataType = IntegerType, - colStat = fetchedStats2.get.colStats("c1"), - expectedColStat = expected1, - rsd = rsd) - val expected2 = ColumnStat(InternalRow(0L, null, null, 0L)) - StatisticsTest.checkColStat( - dataType = LongType, - colStat = fetchedStats2.get.colStats("c2"), - expectedColStat = expected2, - rsd = rsd) - } - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala deleted file mode 100644 index 8cf42e9248c2a..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import org.apache.spark.sql.catalyst.plans.logical.{GlobalLimit, Join, LocalLimit} -import org.apache.spark.sql.types._ - -class StatisticsSuite extends StatisticsTest { - import testImplicits._ - - test("SPARK-15392: DataFrame created from RDD should not be broadcasted") { - val rdd = sparkContext.range(1, 100).map(i => Row(i, i)) - val df = spark.createDataFrame(rdd, new StructType().add("a", LongType).add("b", LongType)) - assert(df.queryExecution.analyzed.statistics.sizeInBytes > - spark.sessionState.conf.autoBroadcastJoinThreshold) - assert(df.selectExpr("a").queryExecution.analyzed.statistics.sizeInBytes > - spark.sessionState.conf.autoBroadcastJoinThreshold) - } - - test("estimates the size of limit") { - withTempView("test") { - Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") - .createOrReplaceTempView("test") - Seq((0, 1), (1, 24), (2, 48)).foreach { case (limit, expected) => - val df = sql(s"""SELECT * FROM test limit $limit""") - - val sizesGlobalLimit = df.queryExecution.analyzed.collect { case g: GlobalLimit => - g.statistics.sizeInBytes - } - assert(sizesGlobalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") - assert(sizesGlobalLimit.head === BigInt(expected), - s"expected exact size $expected for table 'test', got: ${sizesGlobalLimit.head}") - - val sizesLocalLimit = df.queryExecution.analyzed.collect { case l: LocalLimit => - l.statistics.sizeInBytes - } - assert(sizesLocalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") - assert(sizesLocalLimit.head === BigInt(expected), - s"expected exact size $expected for table 'test', got: ${sizesLocalLimit.head}") - } - } - } - - test("estimates the size of a limit 0 on outer join") { - withTempView("test") { - Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") - .createOrReplaceTempView("test") - val df1 = spark.table("test") - val df2 = spark.table("test").limit(0) - val df = df1.join(df2, Seq("k"), "left") - - val sizes = df.queryExecution.analyzed.collect { case g: Join => - g.statistics.sizeInBytes - } - - assert(sizes.size === 1, s"number of Join nodes is wrong:\n ${df.queryExecution}") - assert(sizes.head === BigInt(96), - s"expected exact size 96 for table 'test', got: ${sizes.head}") - } - } - - test("test table-level statistics for data source table created in InMemoryCatalog") { - val tableName = "tbl" - withTable(tableName) { - sql(s"CREATE TABLE $tableName(i INT, j STRING) USING parquet") - Seq(1 -> "a", 2 -> "b").toDF("i", "j").write.mode("overwrite").insertInto(tableName) - - // noscan won't count the number of rows - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") - checkTableStats(tableName, expectedRowCount = None) - - // without noscan, we count the number of rows - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS") - checkTableStats(tableName, expectedRowCount = Some(2)) - } - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala deleted file mode 100644 index 915ee0d31bca2..0000000000000 
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsTest.scala +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql - -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} -import org.apache.spark.sql.execution.command.AnalyzeColumnCommand -import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types._ - - -trait StatisticsTest extends QueryTest with SharedSQLContext { - - def checkColStats( - df: DataFrame, - expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = { - val table = "tbl" - withTable(table) { - df.write.format("json").saveAsTable(table) - val columns = expectedColStatsSeq.map(_._1) - val tableIdent = TableIdentifier(table, Some("default")) - val relation = spark.sessionState.catalog.lookupRelation(tableIdent) - val (_, columnStats) = - AnalyzeColumnCommand.computeColStats(spark, relation, columns.map(_.name)) - expectedColStatsSeq.foreach { case (field, expectedColStat) => - assert(columnStats.contains(field.name)) - val colStat = columnStats(field.name) - StatisticsTest.checkColStat( - dataType = field.dataType, - colStat = colStat, - expectedColStat = expectedColStat, - rsd = spark.sessionState.conf.ndvMaxError) - - // check if we get the same colStat after encoding and decoding - val encodedCS = colStat.toString - val numFields = AnalyzeColumnCommand.numStatFields(field.dataType) - val decodedCS = ColumnStat(numFields, encodedCS) - StatisticsTest.checkColStat( - dataType = field.dataType, - colStat = decodedCS, - expectedColStat = expectedColStat, - rsd = spark.sessionState.conf.ndvMaxError) - } - } - } - - def checkTableStats(tableName: String, expectedRowCount: Option[Int]): Option[Statistics] = { - val df = spark.table(tableName) - val stats = df.queryExecution.analyzed.collect { case rel: LogicalRelation => - assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount) - rel.catalogTable.get.stats - } - assert(stats.size == 1) - stats.head - } -} - -object StatisticsTest { - def checkColStat( - dataType: DataType, - colStat: ColumnStat, - expectedColStat: ColumnStat, - rsd: Double): Unit = { - dataType match { - case StringType => - val cs = colStat.forString - val expectedCS = expectedColStat.forString - assert(cs.numNulls == expectedCS.numNulls) - assert(cs.avgColLen == expectedCS.avgColLen) - assert(cs.maxColLen == expectedCS.maxColLen) - checkNdv(ndv = cs.ndv, expectedNdv = expectedCS.ndv, rsd = rsd) - case BinaryType => - val cs = colStat.forBinary - val expectedCS = expectedColStat.forBinary - assert(cs.numNulls == expectedCS.numNulls) - assert(cs.avgColLen == 
expectedCS.avgColLen) - assert(cs.maxColLen == expectedCS.maxColLen) - case BooleanType => - val cs = colStat.forBoolean - val expectedCS = expectedColStat.forBoolean - assert(cs.numNulls == expectedCS.numNulls) - assert(cs.numTrues == expectedCS.numTrues) - assert(cs.numFalses == expectedCS.numFalses) - case atomicType: AtomicType => - checkNumericColStats( - dataType = atomicType, colStat = colStat, expectedColStat = expectedColStat, rsd = rsd) - } - } - - private def checkNumericColStats( - dataType: AtomicType, - colStat: ColumnStat, - expectedColStat: ColumnStat, - rsd: Double): Unit = { - val cs = colStat.forNumeric(dataType) - val expectedCS = expectedColStat.forNumeric(dataType) - assert(cs.numNulls == expectedCS.numNulls) - assert(cs.max == expectedCS.max) - assert(cs.min == expectedCS.min) - checkNdv(ndv = cs.ndv, expectedNdv = expectedCS.ndv, rsd = rsd) - } - - private def checkNdv(ndv: Long, expectedNdv: Long, rsd: Double): Unit = { - // ndv is an approximate value, so we make sure we have the value, and it should be - // within 3*SD's of the given rsd. - if (expectedNdv == 0) { - assert(ndv == 0) - } else if (expectedNdv > 0) { - assert(ndv > 0) - val error = math.abs((ndv / expectedNdv.toDouble) - 1.0d) - assert(error <= rsd * 3.0d, "Error should be within 3 std. errors.") - } - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index 797fe9ffa8be1..b070138be05d5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -23,9 +23,8 @@ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DescribeFunctionCommand, - DescribeTableCommand, ShowFunctionsCommand} -import org.apache.spark.sql.execution.datasources.{CreateTable, CreateTempViewUsing} +import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} @@ -221,12 +220,22 @@ class SparkSqlParserSuite extends PlanTest { intercept("explain describe tables x", "Unsupported SQL statement") } - test("SPARK-18106 analyze table") { + test("analyze table statistics") { assertEqual("analyze table t compute statistics", AnalyzeTableCommand(TableIdentifier("t"), noscan = false)) assertEqual("analyze table t compute statistics noscan", AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) - assertEqual("analyze table t partition (a) compute statistics noscan", + assertEqual("analyze table t partition (a) compute statistics nOscAn", + AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) + + // Partitions specified - we currently parse them but don't do anything with it + assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS", + AnalyzeTableCommand(TableIdentifier("t"), noscan = false)) + assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan", + AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) + assertEqual("ANALYZE TABLE t PARTITION(ds, hr) COMPUTE 
STATISTICS", + AnalyzeTableCommand(TableIdentifier("t"), noscan = false)) + assertEqual("ANALYZE TABLE t PARTITION(ds, hr) COMPUTE STATISTICS noscan", AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) intercept("analyze table t compute statistics xxxx", @@ -234,4 +243,11 @@ class SparkSqlParserSuite extends PlanTest { intercept("analyze table t partition (a) compute statistics xxxx", "Expected `NOSCAN` instead of `xxxx`") } + + test("analyze table column statistics") { + intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS", "") + + assertEqual("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS key, value", + AnalyzeColumnCommand(TableIdentifier("t"), Seq("key", "value"))) + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index ff0923f04893d..fd9dc32063872 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.execution.command.{AnalyzeColumnCommand, DDLUtils} +import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.internal.StaticSQLConf._ @@ -514,7 +514,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat statsProperties += STATISTICS_NUM_ROWS -> stats.rowCount.get.toString() } stats.colStats.foreach { case (colName, colStat) => - statsProperties += (STATISTICS_COL_STATS_PREFIX + colName) -> colStat.toString + colStat.toMap.foreach { case (k, v) => + statsProperties += (columnStatKeyPropName(colName, k) -> v) + } } tableDefinition.copy(properties = tableDefinition.properties ++ statsProperties) } else { @@ -605,48 +607,65 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat * It reads table schema, provider, partition column names and bucket specification from table * properties, and filter out these special entries from table properties. */ - private def restoreTableMetadata(table: CatalogTable): CatalogTable = { + private def restoreTableMetadata(inputTable: CatalogTable): CatalogTable = { if (conf.get(DEBUG_MODE)) { - return table + return inputTable } - val tableWithSchema = if (table.tableType == VIEW) { - table - } else { - getProviderFromTableProperties(table) match { + var table = inputTable + + if (table.tableType != VIEW) { + table.properties.get(DATASOURCE_PROVIDER) match { // No provider in table properties, which means this table is created by Spark prior to 2.1, // or is created at Hive side. case None => - table.copy(provider = Some(DDLUtils.HIVE_PROVIDER), tracksPartitionsInCatalog = true) + table = table.copy( + provider = Some(DDLUtils.HIVE_PROVIDER), tracksPartitionsInCatalog = true) // This is a Hive serde table created by Spark 2.1 or higher versions. - case Some(DDLUtils.HIVE_PROVIDER) => restoreHiveSerdeTable(table) + case Some(DDLUtils.HIVE_PROVIDER) => + table = restoreHiveSerdeTable(table) // This is a regular data source table. 
- case Some(provider) => restoreDataSourceTable(table, provider) + case Some(provider) => + table = restoreDataSourceTable(table, provider) } } // construct Spark's statistics from information in Hive metastore - val statsProps = tableWithSchema.properties.filterKeys(_.startsWith(STATISTICS_PREFIX)) - val tableWithStats = if (statsProps.nonEmpty) { - val colStatsProps = statsProps.filterKeys(_.startsWith(STATISTICS_COL_STATS_PREFIX)) - .map { case (k, v) => (k.drop(STATISTICS_COL_STATS_PREFIX.length), v) } - val colStats: Map[String, ColumnStat] = tableWithSchema.schema.collect { - case f if colStatsProps.contains(f.name) => - val numFields = AnalyzeColumnCommand.numStatFields(f.dataType) - (f.name, ColumnStat(numFields, colStatsProps(f.name))) - }.toMap - tableWithSchema.copy( + val statsProps = table.properties.filterKeys(_.startsWith(STATISTICS_PREFIX)) + + if (statsProps.nonEmpty) { + val colStats = new scala.collection.mutable.HashMap[String, ColumnStat] + + // For each column, recover its column stats. Note that this is currently a O(n^2) operation, + // but given the number of columns it usually not enormous, this is probably OK as a start. + // If we want to map this a linear operation, we'd need a stronger contract between the + // naming convention used for serialization. + table.schema.foreach { field => + if (statsProps.contains(columnStatKeyPropName(field.name, ColumnStat.KEY_VERSION))) { + // If "version" field is defined, then the column stat is defined. + val keyPrefix = columnStatKeyPropName(field.name, "") + val colStatMap = statsProps.filterKeys(_.startsWith(keyPrefix)).map { case (k, v) => + (k.drop(keyPrefix.length), v) + } + + ColumnStat.fromMap(table.identifier.table, field, colStatMap).foreach { + colStat => colStats += field.name -> colStat + } + } + } + + table = table.copy( stats = Some(Statistics( - sizeInBytes = BigInt(tableWithSchema.properties(STATISTICS_TOTAL_SIZE)), - rowCount = tableWithSchema.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)), - colStats = colStats))) - } else { - tableWithSchema + sizeInBytes = BigInt(table.properties(STATISTICS_TOTAL_SIZE)), + rowCount = table.properties.get(STATISTICS_NUM_ROWS).map(BigInt(_)), + colStats = colStats.toMap))) } - tableWithStats.copy(properties = getOriginalTableProperties(table)) + // Get the original table properties as defined by the user. + table.copy( + properties = table.properties.filterNot { case (key, _) => key.startsWith(SPARK_SQL_PREFIX) }) } private def restoreHiveSerdeTable(table: CatalogTable): CatalogTable = { @@ -1020,17 +1039,17 @@ object HiveExternalCatalog { val TABLE_PARTITION_PROVIDER_CATALOG = "catalog" val TABLE_PARTITION_PROVIDER_FILESYSTEM = "filesystem" - - def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = { - metadata.properties.get(DATASOURCE_PROVIDER) - } - - def getOriginalTableProperties(metadata: CatalogTable): Map[String, String] = { - metadata.properties.filterNot { case (key, _) => key.startsWith(SPARK_SQL_PREFIX) } + /** + * Returns the fully qualified name used in table properties for a particular column stat. + * For example, for column "mycol", and "min" stat, this should return + * "spark.sql.statistics.colStats.mycol.min". + */ + private def columnStatKeyPropName(columnName: String, statKey: String): String = { + STATISTICS_COL_STATS_PREFIX + columnName + "." + statKey } // A persisted data source table always store its schema in the catalog. 
- def getSchemaFromTableProperties(metadata: CatalogTable): StructType = { + private def getSchemaFromTableProperties(metadata: CatalogTable): StructType = { val errorMessage = "Could not read schema from the hive metastore because it is corrupted." val props = metadata.properties val schema = props.get(DATASOURCE_SCHEMA) @@ -1078,11 +1097,11 @@ object HiveExternalCatalog { ) } - def getPartitionColumnsFromTableProperties(metadata: CatalogTable): Seq[String] = { + private def getPartitionColumnsFromTableProperties(metadata: CatalogTable): Seq[String] = { getColumnNamesByType(metadata.properties, "part", "partitioning columns") } - def getBucketSpecFromTableProperties(metadata: CatalogTable): Option[BucketSpec] = { + private def getBucketSpecFromTableProperties(metadata: CatalogTable): Option[BucketSpec] = { metadata.properties.get(DATASOURCE_SCHEMA_NUMBUCKETS).map { numBuckets => BucketSpec( numBuckets.toInt, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 4f5ebc3d838b9..5ae202fdc98da 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -22,56 +22,16 @@ import java.io.{File, PrintWriter} import scala.reflect.ClassTag import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Statistics} -import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DDLUtils} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.plans.logical.Statistics +import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ -class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { - - test("parse analyze commands") { - def assertAnalyzeCommand(analyzeCommand: String, c: Class[_]) { - val parsed = spark.sessionState.sqlParser.parsePlan(analyzeCommand) - val operators = parsed.collect { - case a: AnalyzeTableCommand => a - case o => o - } - - assert(operators.size === 1) - if (operators(0).getClass() != c) { - fail( - s"""$analyzeCommand expected command: $c, but got ${operators(0)} - |parsed command: - |$parsed - """.stripMargin) - } - } - - assertAnalyzeCommand( - "ANALYZE TABLE Table1 COMPUTE STATISTICS", - classOf[AnalyzeTableCommand]) - assertAnalyzeCommand( - "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS", - classOf[AnalyzeTableCommand]) - assertAnalyzeCommand( - "ANALYZE TABLE Table1 PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan", - classOf[AnalyzeTableCommand]) - assertAnalyzeCommand( - "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS", - classOf[AnalyzeTableCommand]) - assertAnalyzeCommand( - "ANALYZE TABLE Table1 PARTITION(ds, hr) COMPUTE STATISTICS noscan", - classOf[AnalyzeTableCommand]) - - assertAnalyzeCommand( - "ANALYZE TABLE Table1 COMPUTE STATISTICS nOscAn", - classOf[AnalyzeTableCommand]) - } +class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton { test("MetastoreRelations fallback to HDFS for size estimation") { val 
enableFallBackToHdfsForStats = spark.sessionState.conf.fallBackToHdfsForStatsEnabled @@ -310,6 +270,110 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils } } + test("verify serialized column stats after analyzing columns") { + import testImplicits._ + + val tableName = "column_stats_test2" + // (data.head.productArity - 1) because the last column does not support stats collection. + assert(stats.size == data.head.productArity - 1) + val df = data.toDF(stats.keys.toSeq :+ "carray" : _*) + + withTable(tableName) { + df.write.saveAsTable(tableName) + + // Collect statistics + sql(s"analyze table $tableName compute STATISTICS FOR COLUMNS " + stats.keys.mkString(", ")) + + // Validate statistics + val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client + val table = hiveClient.getTable("default", tableName) + + val props = table.properties.filterKeys(_.startsWith("spark.sql.statistics.colStats")) + assert(props == Map( + "spark.sql.statistics.colStats.cbinary.avgLen" -> "3", + "spark.sql.statistics.colStats.cbinary.distinctCount" -> "2", + "spark.sql.statistics.colStats.cbinary.maxLen" -> "3", + "spark.sql.statistics.colStats.cbinary.nullCount" -> "1", + "spark.sql.statistics.colStats.cbinary.version" -> "1", + "spark.sql.statistics.colStats.cbool.avgLen" -> "1", + "spark.sql.statistics.colStats.cbool.distinctCount" -> "2", + "spark.sql.statistics.colStats.cbool.max" -> "true", + "spark.sql.statistics.colStats.cbool.maxLen" -> "1", + "spark.sql.statistics.colStats.cbool.min" -> "false", + "spark.sql.statistics.colStats.cbool.nullCount" -> "1", + "spark.sql.statistics.colStats.cbool.version" -> "1", + "spark.sql.statistics.colStats.cbyte.avgLen" -> "1", + "spark.sql.statistics.colStats.cbyte.distinctCount" -> "2", + "spark.sql.statistics.colStats.cbyte.max" -> "2", + "spark.sql.statistics.colStats.cbyte.maxLen" -> "1", + "spark.sql.statistics.colStats.cbyte.min" -> "1", + "spark.sql.statistics.colStats.cbyte.nullCount" -> "1", + "spark.sql.statistics.colStats.cbyte.version" -> "1", + "spark.sql.statistics.colStats.cdate.avgLen" -> "4", + "spark.sql.statistics.colStats.cdate.distinctCount" -> "2", + "spark.sql.statistics.colStats.cdate.max" -> "2016-05-09", + "spark.sql.statistics.colStats.cdate.maxLen" -> "4", + "spark.sql.statistics.colStats.cdate.min" -> "2016-05-08", + "spark.sql.statistics.colStats.cdate.nullCount" -> "1", + "spark.sql.statistics.colStats.cdate.version" -> "1", + "spark.sql.statistics.colStats.cdecimal.avgLen" -> "16", + "spark.sql.statistics.colStats.cdecimal.distinctCount" -> "2", + "spark.sql.statistics.colStats.cdecimal.max" -> "8.000000000000000000", + "spark.sql.statistics.colStats.cdecimal.maxLen" -> "16", + "spark.sql.statistics.colStats.cdecimal.min" -> "1.000000000000000000", + "spark.sql.statistics.colStats.cdecimal.nullCount" -> "1", + "spark.sql.statistics.colStats.cdecimal.version" -> "1", + "spark.sql.statistics.colStats.cdouble.avgLen" -> "8", + "spark.sql.statistics.colStats.cdouble.distinctCount" -> "2", + "spark.sql.statistics.colStats.cdouble.max" -> "6.0", + "spark.sql.statistics.colStats.cdouble.maxLen" -> "8", + "spark.sql.statistics.colStats.cdouble.min" -> "1.0", + "spark.sql.statistics.colStats.cdouble.nullCount" -> "1", + "spark.sql.statistics.colStats.cdouble.version" -> "1", + "spark.sql.statistics.colStats.cfloat.avgLen" -> "4", + "spark.sql.statistics.colStats.cfloat.distinctCount" -> "2", + "spark.sql.statistics.colStats.cfloat.max" -> "7.0", + 
"spark.sql.statistics.colStats.cfloat.maxLen" -> "4", + "spark.sql.statistics.colStats.cfloat.min" -> "1.0", + "spark.sql.statistics.colStats.cfloat.nullCount" -> "1", + "spark.sql.statistics.colStats.cfloat.version" -> "1", + "spark.sql.statistics.colStats.cint.avgLen" -> "4", + "spark.sql.statistics.colStats.cint.distinctCount" -> "2", + "spark.sql.statistics.colStats.cint.max" -> "4", + "spark.sql.statistics.colStats.cint.maxLen" -> "4", + "spark.sql.statistics.colStats.cint.min" -> "1", + "spark.sql.statistics.colStats.cint.nullCount" -> "1", + "spark.sql.statistics.colStats.cint.version" -> "1", + "spark.sql.statistics.colStats.clong.avgLen" -> "8", + "spark.sql.statistics.colStats.clong.distinctCount" -> "2", + "spark.sql.statistics.colStats.clong.max" -> "5", + "spark.sql.statistics.colStats.clong.maxLen" -> "8", + "spark.sql.statistics.colStats.clong.min" -> "1", + "spark.sql.statistics.colStats.clong.nullCount" -> "1", + "spark.sql.statistics.colStats.clong.version" -> "1", + "spark.sql.statistics.colStats.cshort.avgLen" -> "2", + "spark.sql.statistics.colStats.cshort.distinctCount" -> "2", + "spark.sql.statistics.colStats.cshort.max" -> "3", + "spark.sql.statistics.colStats.cshort.maxLen" -> "2", + "spark.sql.statistics.colStats.cshort.min" -> "1", + "spark.sql.statistics.colStats.cshort.nullCount" -> "1", + "spark.sql.statistics.colStats.cshort.version" -> "1", + "spark.sql.statistics.colStats.cstring.avgLen" -> "3", + "spark.sql.statistics.colStats.cstring.distinctCount" -> "2", + "spark.sql.statistics.colStats.cstring.maxLen" -> "3", + "spark.sql.statistics.colStats.cstring.nullCount" -> "1", + "spark.sql.statistics.colStats.cstring.version" -> "1", + "spark.sql.statistics.colStats.ctimestamp.avgLen" -> "8", + "spark.sql.statistics.colStats.ctimestamp.distinctCount" -> "2", + "spark.sql.statistics.colStats.ctimestamp.max" -> "2016-05-09 00:00:02.0", + "spark.sql.statistics.colStats.ctimestamp.maxLen" -> "8", + "spark.sql.statistics.colStats.ctimestamp.min" -> "2016-05-08 00:00:01.0", + "spark.sql.statistics.colStats.ctimestamp.nullCount" -> "1", + "spark.sql.statistics.colStats.ctimestamp.version" -> "1" + )) + } + } + private def testUpdatingTableStats(tableDescription: String, createTableCmd: String): Unit = { test("test table-level statistics for " + tableDescription) { val parquetTable = "parquetTable" @@ -319,7 +383,8 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils TableIdentifier(parquetTable)) assert(DDLUtils.isDatasourceTable(catalogTable)) - sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") + // Add a filter to avoid creating too many partitions + sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src WHERE key < 10") checkTableStats( parquetTable, isDataSourceTable = true, hasSizeInBytes = false, expectedRowCounts = None) @@ -328,7 +393,7 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils val fetchedStats1 = checkTableStats( parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) - sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src") + sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src WHERE key < 10") sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan") val fetchedStats2 = checkTableStats( parquetTable, isDataSourceTable = true, hasSizeInBytes = true, expectedRowCounts = None) @@ -340,7 +405,7 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils parquetTable, isDataSourceTable = true, hasSizeInBytes = true, 
- expectedRowCounts = Some(1000)) + expectedRowCounts = Some(20)) assert(fetchedStats3.get.sizeInBytes == fetchedStats2.get.sizeInBytes) } } @@ -369,6 +434,7 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils } } + /** Used to test refreshing cached metadata once table stats are updated. */ private def getStatsBeforeAfterUpdate(isAnalyzeColumns: Boolean): (Statistics, Statistics) = { val tableName = "tbl" var statsBeforeUpdate: Statistics = null @@ -411,145 +477,6 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils assert(statsAfterUpdate.rowCount == Some(2)) } - test("test refreshing column stats of cached data source table by `ANALYZE TABLE` statement") { - val (statsBeforeUpdate, statsAfterUpdate) = getStatsBeforeAfterUpdate(isAnalyzeColumns = true) - - assert(statsBeforeUpdate.sizeInBytes > 0) - assert(statsBeforeUpdate.rowCount == Some(1)) - StatisticsTest.checkColStat( - dataType = IntegerType, - colStat = statsBeforeUpdate.colStats("key"), - expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)), - rsd = spark.sessionState.conf.ndvMaxError) - - assert(statsAfterUpdate.sizeInBytes > statsBeforeUpdate.sizeInBytes) - assert(statsAfterUpdate.rowCount == Some(2)) - StatisticsTest.checkColStat( - dataType = IntegerType, - colStat = statsAfterUpdate.colStats("key"), - expectedColStat = ColumnStat(InternalRow(0L, 2, 1, 2L)), - rsd = spark.sessionState.conf.ndvMaxError) - } - - private lazy val (testDataFrame, expectedColStatsSeq) = { - import testImplicits._ - - val intSeq = Seq(1, 2) - val stringSeq = Seq("a", "bb") - val binarySeq = Seq("a", "bb").map(_.getBytes) - val booleanSeq = Seq(true, false) - val data = intSeq.indices.map { i => - (intSeq(i), stringSeq(i), binarySeq(i), booleanSeq(i)) - } - val df: DataFrame = data.toDF("c1", "c2", "c3", "c4") - val expectedColStatsSeq: Seq[(StructField, ColumnStat)] = df.schema.map { f => - val colStat = f.dataType match { - case IntegerType => - ColumnStat(InternalRow(0L, intSeq.max, intSeq.min, intSeq.distinct.length.toLong)) - case StringType => - ColumnStat(InternalRow(0L, stringSeq.map(_.length).sum / stringSeq.length.toDouble, - stringSeq.map(_.length).max.toInt, stringSeq.distinct.length.toLong)) - case BinaryType => - ColumnStat(InternalRow(0L, binarySeq.map(_.length).sum / binarySeq.length.toDouble, - binarySeq.map(_.length).max.toInt)) - case BooleanType => - ColumnStat(InternalRow(0L, booleanSeq.count(_.equals(true)).toLong, - booleanSeq.count(_.equals(false)).toLong)) - } - (f, colStat) - } - (df, expectedColStatsSeq) - } - - private def checkColStats( - tableName: String, - isDataSourceTable: Boolean, - expectedColStatsSeq: Seq[(StructField, ColumnStat)]): Unit = { - val readback = spark.table(tableName) - val stats = readback.queryExecution.analyzed.collect { - case rel: MetastoreRelation => - assert(!isDataSourceTable, "Expected a Hive serde table, but got a data source table") - rel.catalogTable.stats.get - case rel: LogicalRelation => - assert(isDataSourceTable, "Expected a data source table, but got a Hive serde table") - rel.catalogTable.get.stats.get - } - assert(stats.length == 1) - val columnStats = stats.head.colStats - assert(columnStats.size == expectedColStatsSeq.length) - expectedColStatsSeq.foreach { case (field, expectedColStat) => - StatisticsTest.checkColStat( - dataType = field.dataType, - colStat = columnStats(field.name), - expectedColStat = expectedColStat, - rsd = spark.sessionState.conf.ndvMaxError) - } - } - - test("generate and load 
column-level stats for data source table") { - val dsTable = "dsTable" - withTable(dsTable) { - testDataFrame.write.format("parquet").saveAsTable(dsTable) - sql(s"ANALYZE TABLE $dsTable COMPUTE STATISTICS FOR COLUMNS c1, c2, c3, c4") - checkColStats(dsTable, isDataSourceTable = true, expectedColStatsSeq) - } - } - - test("generate and load column-level stats for hive serde table") { - val hTable = "hTable" - val tmp = "tmp" - withTable(hTable, tmp) { - testDataFrame.write.format("parquet").saveAsTable(tmp) - sql(s"CREATE TABLE $hTable (c1 int, c2 string, c3 binary, c4 boolean) STORED AS TEXTFILE") - sql(s"INSERT INTO $hTable SELECT * FROM $tmp") - sql(s"ANALYZE TABLE $hTable COMPUTE STATISTICS FOR COLUMNS c1, c2, c3, c4") - checkColStats(hTable, isDataSourceTable = false, expectedColStatsSeq) - } - } - - // When caseSensitive is on, for columns with only case difference, they are different columns - // and we should generate column stats for all of them. - private def checkCaseSensitiveColStats(columnName: String): Unit = { - val tableName = "tbl" - withTable(tableName) { - val column1 = columnName.toLowerCase - val column2 = columnName.toUpperCase - withSQLConf("spark.sql.caseSensitive" -> "true") { - sql(s"CREATE TABLE $tableName (`$column1` int, `$column2` double) USING PARQUET") - sql(s"INSERT INTO $tableName SELECT 1, 3.0") - sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS `$column1`, `$column2`") - val readback = spark.table(tableName) - val relations = readback.queryExecution.analyzed.collect { case rel: LogicalRelation => - val columnStats = rel.catalogTable.get.stats.get.colStats - assert(columnStats.size == 2) - StatisticsTest.checkColStat( - dataType = IntegerType, - colStat = columnStats(column1), - expectedColStat = ColumnStat(InternalRow(0L, 1, 1, 1L)), - rsd = spark.sessionState.conf.ndvMaxError) - StatisticsTest.checkColStat( - dataType = DoubleType, - colStat = columnStats(column2), - expectedColStat = ColumnStat(InternalRow(0L, 3.0d, 3.0d, 1L)), - rsd = spark.sessionState.conf.ndvMaxError) - rel - } - assert(relations.size == 1) - } - } - } - - test("check column statistics for case sensitive column names") { - checkCaseSensitiveColStats(columnName = "c1") - } - - test("check column statistics for case sensitive non-ascii column names") { - // scalastyle:off - // non ascii characters are not allowed in the source code, so we disable the scalastyle. - checkCaseSensitiveColStats(columnName = "列c") - // scalastyle:on - } - test("estimates the size of a test MetastoreRelation") { val df = sql("""SELECT * FROM src""") val sizes = df.queryExecution.analyzed.collect { case mr: MetastoreRelation => From 835f03f344f2dea2134409d09e06b34feaae09f9 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 23 Nov 2016 12:54:18 -0500 Subject: [PATCH 184/534] [SPARK-18050][SQL] do not create default database if it already exists ## What changes were proposed in this pull request? When we try to create the default database, we ask hive to do nothing if it already exists. However, Hive will log an error message instead of doing nothing, and the error message is quite annoying and confusing. In this PR, we only create default database if it doesn't exist. ## How was this patch tested? N/A Author: Wenchen Fan Closes #15993 from cloud-fan/default-db. 
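
For reference, the guarded initialization boils down to the sketch below (the standalone helper and its name are illustrative; the catalog classes are Spark-internal APIs):

```scala
import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, ExternalCatalog, SessionCatalog}

// Probe the metastore first so Hive never logs the spurious "database already exists"
// error, but keep ignoreIfExists = true because another Spark application may be
// creating the default database at the same time.
def ensureDefaultDatabase(externalCatalog: ExternalCatalog, warehousePath: String): Unit = {
  val defaultDbDefinition = CatalogDatabase(
    SessionCatalog.DEFAULT_DATABASE, "default database", warehousePath, Map())
  if (!externalCatalog.databaseExists(SessionCatalog.DEFAULT_DATABASE)) {
    externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true)
  }
}
```
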
(cherry picked from commit f129ebcd302168b628f47705f4a7d6b7e7b057b0) Signed-off-by: Andrew Or --- .../scala/org/apache/spark/sql/internal/SharedState.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index 6232c18b1cea8..8de95fe64e663 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -92,8 +92,12 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging { { val defaultDbDefinition = CatalogDatabase( SessionCatalog.DEFAULT_DATABASE, "default database", warehousePath, Map()) - // Initialize default database if it doesn't already exist - externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true) + // Initialize default database if it doesn't exist + if (!externalCatalog.databaseExists(SessionCatalog.DEFAULT_DATABASE)) { + // There may be another Spark application creating default database at the same time, here we + // set `ignoreIfExists = true` to avoid `DatabaseAlreadyExists` exception. + externalCatalog.createDatabase(defaultDbDefinition, ignoreIfExists = true) + } } /** From 15d2cf26427084c0398f8d9303c218f360c52bb7 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Wed, 23 Nov 2016 11:48:59 -0800 Subject: [PATCH 185/534] [SPARK-18510] Fix data corruption from inferred partition column dataTypes ## What changes were proposed in this pull request? ### The Issue If I specify my schema when doing ```scala spark.read .schema(someSchemaWherePartitionColumnsAreStrings) ``` but if the partition inference can infer it as IntegerType or I assume LongType or DoubleType (basically fixed size types), then once UnsafeRows are generated, your data will be corrupted. ### Proposed solution The partition handling code path is kind of a mess. In my fix I'm probably adding to the mess, but at least trying to standardize the code path. The real issue is that a user that uses the `spark.read` code path can never clearly specify what the partition columns are. If you try to specify the fields in `schema`, we practically ignore what the user provides, and fall back to our inferred data types. What happens in the end is data corruption. My solution tries to fix this by always trying to infer partition columns the first time you specify the table. Once we find what the partition columns are, we try to find them in the user specified schema and use the dataType provided there, or fall back to the smallest common data type. We will ALWAYS append partition columns to the user's schema, even if they didn't ask for it. We will only use the data type they provided if they specified it. While this is confusing, this has been the behavior since Spark 1.6, and I didn't want to change this behavior in the QA period of Spark 2.1. We may revisit this decision later. A side effect of this PR is that we won't need https://github.com/apache/spark/pull/15942 if this PR goes in. ## How was this patch tested? Regression tests Author: Burak Yavuz Closes #15951 from brkyvz/partition-corruption. 
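
To make the scenario concrete, a minimal end-to-end sketch (assuming a `spark` session as in `spark-shell`; the path and column names are made up for illustration):

```scala
import org.apache.spark.sql.types.{StringType, StructType}
import spark.implicits._

val path = "/tmp/spark-18510-demo"  // illustrative scratch location

// `part` holds numeric-looking values, so partition inference would type it as an integer.
Seq((1L, "a", 0), (2L, "b", 1)).toDF("id", "value", "part")
  .write.mode("overwrite").partitionBy("part").parquet(path)

// Declare the partition column as a string in the user-specified schema. Previously the
// inferred fixed-size type silently overrode this and corrupted the generated UnsafeRows;
// with this change the string type from the user's schema is respected for `part`.
val schema = new StructType()
  .add("id", "long")
  .add("value", "string")
  .add("part", StringType)

spark.read.schema(schema).parquet(path).printSchema()
```
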
(cherry picked from commit 0d1bf2b6c8ac4d4141d7cef0552c22e586843c57) Signed-off-by: Tathagata Das --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 2 +- .../execution/datasources/DataSource.scala | 159 ++++++++++++------ .../sql/execution/command/DDLSuite.scala | 2 +- .../sql/streaming/FileStreamSourceSuite.scala | 2 +- .../test/DataStreamReaderWriterSuite.scala | 45 ++++- .../sql/test/DataFrameReaderWriterSuite.scala | 38 ++++- 6 files changed, 190 insertions(+), 58 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index ee48baa59c7af..c669c2e2e26ef 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2684,7 +2684,7 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume # It makes sure that we can omit path argument in read.df API and then it calls # DataFrameWriter.load() without path. expect_error(read.df(source = "json"), - paste("Error in loadDF : analysis error - Unable to infer schema for JSON at .", + paste("Error in loadDF : analysis error - Unable to infer schema for JSON.", "It must be specified manually")) expect_error(read.df("arbitrary_path"), "Error in loadDF : analysis error - Path does not exist") expect_error(read.json("arbitrary_path"), "Error in json : analysis error - Path does not exist") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 84fde0bbf9268..dbc3e712332f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -61,8 +61,12 @@ import org.apache.spark.util.Utils * qualified. This option only works when reading from a [[FileFormat]]. * @param userSpecifiedSchema An optional specification of the schema of the data. When present * we skip attempting to infer the schema. - * @param partitionColumns A list of column names that the relation is partitioned by. When this - * list is empty, the relation is unpartitioned. + * @param partitionColumns A list of column names that the relation is partitioned by. This list is + * generally empty during the read path, unless this DataSource is managed + * by Hive. In these cases, during `resolveRelation`, we will call + * `getOrInferFileFormatSchema` for file based DataSources to infer the + * partitioning. In other cases, if this list is empty, then this table + * is unpartitioned. * @param bucketSpec An optional specification for bucketing (hash-partitioning) of the data. * @param catalogTable Optional catalog table reference that can be used to push down operations * over the datasource to the catalog service. @@ -84,30 +88,106 @@ case class DataSource( private val caseInsensitiveOptions = new CaseInsensitiveMap(options) /** - * Infer the schema of the given FileFormat, returns a pair of schema and partition column names. + * Get the schema of the given FileFormat, if provided by `userSpecifiedSchema`, or try to infer + * it. In the read path, only managed tables by Hive provide the partition columns properly when + * initializing this class. All other file based data sources will try to infer the partitioning, + * and then cast the inferred types to user specified dataTypes if the partition columns exist + * inside `userSpecifiedSchema`, otherwise we can hit data corruption bugs like SPARK-18510. 
+ * This method will try to skip file scanning whether `userSpecifiedSchema` and + * `partitionColumns` are provided. Here are some code paths that use this method: + * 1. `spark.read` (no schema): Most amount of work. Infer both schema and partitioning columns + * 2. `spark.read.schema(userSpecifiedSchema)`: Parse partitioning columns, cast them to the + * dataTypes provided in `userSpecifiedSchema` if they exist or fallback to inferred + * dataType if they don't. + * 3. `spark.readStream.schema(userSpecifiedSchema)`: For streaming use cases, users have to + * provide the schema. Here, we also perform partition inference like 2, and try to use + * dataTypes in `userSpecifiedSchema`. All subsequent triggers for this stream will re-use + * this information, therefore calls to this method should be very cheap, i.e. there won't + * be any further inference in any triggers. + * 4. `df.saveAsTable(tableThatExisted)`: In this case, we call this method to resolve the + * existing table's partitioning scheme. This is achieved by not providing + * `userSpecifiedSchema`. For this case, we add the boolean `justPartitioning` for an early + * exit, if we don't care about the schema of the original table. + * + * @param format the file format object for this DataSource + * @param justPartitioning Whether to exit early and provide just the schema partitioning. + * @return A pair of the data schema (excluding partition columns) and the schema of the partition + * columns. If `justPartitioning` is `true`, then the dataSchema will be provided as + * `null`. */ - private def inferFileFormatSchema(format: FileFormat): (StructType, Seq[String]) = { - userSpecifiedSchema.map(_ -> partitionColumns).orElse { - val allPaths = caseInsensitiveOptions.get("path") + private def getOrInferFileFormatSchema( + format: FileFormat, + justPartitioning: Boolean = false): (StructType, StructType) = { + // the operations below are expensive therefore try not to do them if we don't need to + lazy val tempFileCatalog = { + val allPaths = caseInsensitiveOptions.get("path") ++ paths + val hadoopConf = sparkSession.sessionState.newHadoopConf() val globbedPaths = allPaths.toSeq.flatMap { path => val hdfsPath = new Path(path) - val fs = hdfsPath.getFileSystem(sparkSession.sessionState.newHadoopConf()) + val fs = hdfsPath.getFileSystem(hadoopConf) val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) SparkHadoopUtil.get.globPathIfNecessary(qualified) }.toArray - val fileCatalog = new InMemoryFileIndex(sparkSession, globbedPaths, options, None) - val partitionSchema = fileCatalog.partitionSpec().partitionColumns - val inferred = format.inferSchema( + new InMemoryFileIndex(sparkSession, globbedPaths, options, None) + } + val partitionSchema = if (partitionColumns.isEmpty && catalogTable.isEmpty) { + // Try to infer partitioning, because no DataSource in the read path provides the partitioning + // columns properly unless it is a Hive DataSource + val resolved = tempFileCatalog.partitionSchema.map { partitionField => + val equality = sparkSession.sessionState.conf.resolver + // SPARK-18510: try to get schema from userSpecifiedSchema, otherwise fallback to inferred + userSpecifiedSchema.flatMap(_.find(f => equality(f.name, partitionField.name))).getOrElse( + partitionField) + } + StructType(resolved) + } else { + // in streaming mode, we have already inferred and registered partition columns, we will + // never have to materialize the lazy val below + lazy val inferredPartitions = tempFileCatalog.partitionSchema + // 
maintain old behavior before SPARK-18510. If userSpecifiedSchema is empty used inferred + // partitioning + if (userSpecifiedSchema.isEmpty) { + inferredPartitions + } else { + val partitionFields = partitionColumns.map { partitionColumn => + userSpecifiedSchema.flatMap(_.find(_.name == partitionColumn)).orElse { + val inferredOpt = inferredPartitions.find(_.name == partitionColumn) + if (inferredOpt.isDefined) { + logDebug( + s"""Type of partition column: $partitionColumn not found in specified schema + |for $format. + |User Specified Schema + |===================== + |${userSpecifiedSchema.orNull} + | + |Falling back to inferred dataType if it exists. + """.stripMargin) + } + inferredPartitions.find(_.name == partitionColumn) + }.getOrElse { + throw new AnalysisException(s"Failed to resolve the schema for $format for " + + s"the partition column: $partitionColumn. It must be specified manually.") + } + } + StructType(partitionFields) + } + } + if (justPartitioning) { + return (null, partitionSchema) + } + val dataSchema = userSpecifiedSchema.map { schema => + val equality = sparkSession.sessionState.conf.resolver + StructType(schema.filterNot(f => partitionSchema.exists(p => equality(p.name, f.name)))) + }.orElse { + format.inferSchema( sparkSession, caseInsensitiveOptions, - fileCatalog.allFiles()) - - inferred.map { inferredSchema => - StructType(inferredSchema ++ partitionSchema) -> partitionSchema.map(_.name) - } + tempFileCatalog.allFiles()) }.getOrElse { - throw new AnalysisException("Unable to infer schema. It must be specified manually.") + throw new AnalysisException( + s"Unable to infer schema for $format. It must be specified manually.") } + (dataSchema, partitionSchema) } /** Returns the name and schema of the source that can be used to continually read data. */ @@ -144,8 +224,8 @@ case class DataSource( "you may be able to create a static DataFrame on that directory with " + "'spark.read.load(directory)' and infer schema from it.") } - val (schema, partCols) = inferFileFormatSchema(format) - SourceInfo(s"FileSource[$path]", schema, partCols) + val (schema, partCols) = getOrInferFileFormatSchema(format) + SourceInfo(s"FileSource[$path]", StructType(schema ++ partCols), partCols.fieldNames) case _ => throw new UnsupportedOperationException( @@ -272,7 +352,7 @@ case class DataSource( HadoopFsRelation( fileCatalog, - partitionSchema = fileCatalog.partitionSpec().partitionColumns, + partitionSchema = fileCatalog.partitionSchema, dataSchema = dataSchema, bucketSpec = None, format, @@ -281,9 +361,10 @@ case class DataSource( // This is a non-streaming file based datasource. 
case (format: FileFormat, _) => val allPaths = caseInsensitiveOptions.get("path") ++ paths + val hadoopConf = sparkSession.sessionState.newHadoopConf() val globbedPaths = allPaths.flatMap { path => val hdfsPath = new Path(path) - val fs = hdfsPath.getFileSystem(sparkSession.sessionState.newHadoopConf()) + val fs = hdfsPath.getFileSystem(hadoopConf) val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val globPath = SparkHadoopUtil.get.globPathIfNecessary(qualified) @@ -291,23 +372,14 @@ case class DataSource( throw new AnalysisException(s"Path does not exist: $qualified") } // Sufficient to check head of the globPath seq for non-glob scenario + // Don't need to check once again if files exist in streaming mode if (checkFilesExist && !fs.exists(globPath.head)) { throw new AnalysisException(s"Path does not exist: ${globPath.head}") } globPath }.toArray - // If they gave a schema, then we try and figure out the types of the partition columns - // from that schema. - val partitionSchema = userSpecifiedSchema.map { schema => - StructType( - partitionColumns.map { c => - // TODO: Case sensitivity. - schema - .find(_.name.toLowerCase() == c.toLowerCase()) - .getOrElse(throw new AnalysisException(s"Invalid partition column '$c'")) - }) - } + val (dataSchema, inferredPartitionSchema) = getOrInferFileFormatSchema(format) val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions && catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog) { @@ -316,27 +388,12 @@ case class DataSource( catalogTable.get, catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L)) } else { - new InMemoryFileIndex( - sparkSession, globbedPaths, options, partitionSchema) - } - - val dataSchema = userSpecifiedSchema.map { schema => - val equality = sparkSession.sessionState.conf.resolver - StructType(schema.filterNot(f => partitionColumns.exists(equality(_, f.name)))) - }.orElse { - format.inferSchema( - sparkSession, - caseInsensitiveOptions, - fileCatalog.asInstanceOf[InMemoryFileIndex].allFiles()) - }.getOrElse { - throw new AnalysisException( - s"Unable to infer schema for $format at ${allPaths.take(2).mkString(",")}. " + - "It must be specified manually") + new InMemoryFileIndex(sparkSession, globbedPaths, options, Some(inferredPartitionSchema)) } HadoopFsRelation( fileCatalog, - partitionSchema = fileCatalog.partitionSchema, + partitionSchema = inferredPartitionSchema, dataSchema = dataSchema.asNullable, bucketSpec = bucketSpec, format, @@ -384,11 +441,7 @@ case class DataSource( // up. If we fail to load the table for whatever reason, ignore the check. if (mode == SaveMode.Append) { val existingPartitionColumns = Try { - resolveRelation() - .asInstanceOf[HadoopFsRelation] - .partitionSchema - .fieldNames - .toSeq + getOrInferFileFormatSchema(format, justPartitioning = true)._2.fieldNames.toList }.getOrElse(Seq.empty[String]) // TODO: Case sensitivity. 
val sameColumns = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 02d9d15684904..10843e9ba5753 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -274,7 +274,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { pathToPartitionedTable, userSpecifiedSchema = Option("num int, str string"), userSpecifiedPartitionCols = partitionCols, - expectedSchema = new StructType().add("num", IntegerType).add("str", StringType), + expectedSchema = new StructType().add("str", StringType).add("num", IntegerType), expectedPartitionCols = partitionCols.map(Seq(_)).getOrElse(Seq.empty[String])) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index a099153d2e58e..bad6642ea4058 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -282,7 +282,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { createFileStreamSourceAndGetSchema( format = Some("json"), path = Some(src.getCanonicalPath), schema = None) } - assert("Unable to infer schema. It must be specified manually.;" === e.getMessage) + assert("Unable to infer schema for JSON. It must be specified manually.;" === e.getMessage) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index 5630464f40803..0eb95a02432fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, StreamingQuery, StreamTest} -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.util.Utils object LastOptions { @@ -532,4 +532,47 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter { assert(e.getMessage.contains("does not support recovering")) assert(e.getMessage.contains("checkpoint location")) } + + test("SPARK-18510: use user specified types for partition columns in file sources") { + import org.apache.spark.sql.functions.udf + import testImplicits._ + withTempDir { src => + val createArray = udf { (length: Long) => + for (i <- 1 to length.toInt) yield i.toString + } + spark.range(4).select(createArray('id + 1) as 'ex, 'id, 'id % 4 as 'part).coalesce(1).write + .partitionBy("part", "id") + .mode("overwrite") + .parquet(src.toString) + // Specify a random ordering of the schema, partition column in the middle, etc. + // Also let's say that the partition columns are Strings instead of Longs. 
+ // partition columns should go to the end + val schema = new StructType() + .add("id", StringType) + .add("ex", ArrayType(StringType)) + + val sdf = spark.readStream + .schema(schema) + .format("parquet") + .load(src.toString) + + assert(sdf.schema.toList === List( + StructField("ex", ArrayType(StringType)), + StructField("part", IntegerType), // inferred partitionColumn dataType + StructField("id", StringType))) // used user provided partitionColumn dataType + + val sq = sdf.writeStream + .queryName("corruption_test") + .format("memory") + .start() + sq.processAllAvailable() + checkAnswer( + spark.table("corruption_test"), + // notice how `part` is ordered before `id` + Row(Array("1"), 0, "0") :: Row(Array("1", "2"), 1, "1") :: + Row(Array("1", "2", "3"), 2, "2") :: Row(Array("1", "2", "3", "4"), 3, "3") :: Nil + ) + sq.stop() + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index a7fda01098560..e0887e0f1c7de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -24,7 +24,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -573,4 +573,40 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be } } } + + test("SPARK-18510: use user specified types for partition columns in file sources") { + import org.apache.spark.sql.functions.udf + import testImplicits._ + withTempDir { src => + val createArray = udf { (length: Long) => + for (i <- 1 to length.toInt) yield i.toString + } + spark.range(4).select(createArray('id + 1) as 'ex, 'id, 'id % 4 as 'part).coalesce(1).write + .partitionBy("part", "id") + .mode("overwrite") + .parquet(src.toString) + // Specify a random ordering of the schema, partition column in the middle, etc. + // Also let's say that the partition columns are Strings instead of Longs. + // partition columns should go to the end + val schema = new StructType() + .add("id", StringType) + .add("ex", ArrayType(StringType)) + val df = spark.read + .schema(schema) + .format("parquet") + .load(src.toString) + + assert(df.schema.toList === List( + StructField("ex", ArrayType(StringType)), + StructField("part", IntegerType), // inferred partitionColumn dataType + StructField("id", StringType))) // used user provided partitionColumn dataType + + checkAnswer( + df, + // notice how `part` is ordered before `id` + Row(Array("1"), 0, "0") :: Row(Array("1", "2"), 1, "1") :: + Row(Array("1", "2", "3"), 2, "2") :: Row(Array("1", "2", "3", "4"), 3, "3") :: Nil + ) + } + } } From 27d81d0007f4358480148fa6f3f6b079a5431a81 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Wed, 23 Nov 2016 16:15:35 -0800 Subject: [PATCH 186/534] [SPARK-18510][SQL] Follow up to address comments in #15951 ## What changes were proposed in this pull request? This PR addressed the rest comments in #15951. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #15997 from zsxwing/SPARK-18510-follow-up. 
(cherry picked from commit 223fa218e1f637f0d62332785a3bee225b65b990) Signed-off-by: Tathagata Das --- .../execution/datasources/DataSource.scala | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index dbc3e712332f7..ccfc759c8fa7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -118,8 +118,10 @@ case class DataSource( private def getOrInferFileFormatSchema( format: FileFormat, justPartitioning: Boolean = false): (StructType, StructType) = { - // the operations below are expensive therefore try not to do them if we don't need to - lazy val tempFileCatalog = { + // the operations below are expensive therefore try not to do them if we don't need to, e.g., + // in streaming mode, we have already inferred and registered partition columns, we will + // never have to materialize the lazy val below + lazy val tempFileIndex = { val allPaths = caseInsensitiveOptions.get("path") ++ paths val hadoopConf = sparkSession.sessionState.newHadoopConf() val globbedPaths = allPaths.toSeq.flatMap { path => @@ -133,7 +135,7 @@ case class DataSource( val partitionSchema = if (partitionColumns.isEmpty && catalogTable.isEmpty) { // Try to infer partitioning, because no DataSource in the read path provides the partitioning // columns properly unless it is a Hive DataSource - val resolved = tempFileCatalog.partitionSchema.map { partitionField => + val resolved = tempFileIndex.partitionSchema.map { partitionField => val equality = sparkSession.sessionState.conf.resolver // SPARK-18510: try to get schema from userSpecifiedSchema, otherwise fallback to inferred userSpecifiedSchema.flatMap(_.find(f => equality(f.name, partitionField.name))).getOrElse( @@ -141,17 +143,17 @@ case class DataSource( } StructType(resolved) } else { - // in streaming mode, we have already inferred and registered partition columns, we will - // never have to materialize the lazy val below - lazy val inferredPartitions = tempFileCatalog.partitionSchema // maintain old behavior before SPARK-18510. If userSpecifiedSchema is empty used inferred // partitioning if (userSpecifiedSchema.isEmpty) { + val inferredPartitions = tempFileIndex.partitionSchema inferredPartitions } else { val partitionFields = partitionColumns.map { partitionColumn => - userSpecifiedSchema.flatMap(_.find(_.name == partitionColumn)).orElse { - val inferredOpt = inferredPartitions.find(_.name == partitionColumn) + val equality = sparkSession.sessionState.conf.resolver + userSpecifiedSchema.flatMap(_.find(c => equality(c.name, partitionColumn))).orElse { + val inferredPartitions = tempFileIndex.partitionSchema + val inferredOpt = inferredPartitions.find(p => equality(p.name, partitionColumn)) if (inferredOpt.isDefined) { logDebug( s"""Type of partition column: $partitionColumn not found in specified schema @@ -163,7 +165,7 @@ case class DataSource( |Falling back to inferred dataType if it exists. """.stripMargin) } - inferredPartitions.find(_.name == partitionColumn) + inferredOpt }.getOrElse { throw new AnalysisException(s"Failed to resolve the schema for $format for " + s"the partition column: $partitionColumn. 
It must be specified manually.") @@ -182,7 +184,7 @@ case class DataSource( format.inferSchema( sparkSession, caseInsensitiveOptions, - tempFileCatalog.allFiles()) + tempFileIndex.allFiles()) }.getOrElse { throw new AnalysisException( s"Unable to infer schema for $format. It must be specified manually.") @@ -224,8 +226,11 @@ case class DataSource( "you may be able to create a static DataFrame on that directory with " + "'spark.read.load(directory)' and infer schema from it.") } - val (schema, partCols) = getOrInferFileFormatSchema(format) - SourceInfo(s"FileSource[$path]", StructType(schema ++ partCols), partCols.fieldNames) + val (dataSchema, partitionSchema) = getOrInferFileFormatSchema(format) + SourceInfo( + s"FileSource[$path]", + StructType(dataSchema ++ partitionSchema), + partitionSchema.fieldNames) case _ => throw new UnsupportedOperationException( @@ -379,7 +384,7 @@ case class DataSource( globPath }.toArray - val (dataSchema, inferredPartitionSchema) = getOrInferFileFormatSchema(format) + val (dataSchema, partitionSchema) = getOrInferFileFormatSchema(format) val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions && catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog) { @@ -388,12 +393,12 @@ case class DataSource( catalogTable.get, catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L)) } else { - new InMemoryFileIndex(sparkSession, globbedPaths, options, Some(inferredPartitionSchema)) + new InMemoryFileIndex(sparkSession, globbedPaths, options, Some(partitionSchema)) } HadoopFsRelation( fileCatalog, - partitionSchema = inferredPartitionSchema, + partitionSchema = partitionSchema, dataSchema = dataSchema.asNullable, bucketSpec = bucketSpec, format, From 04ec74f1274a164b2f72b31e2c147e042bf41bd9 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 24 Nov 2016 05:46:05 -0800 Subject: [PATCH 187/534] [SPARK-18520][ML] Add missing setXXXCol methods for BisectingKMeansModel and GaussianMixtureModel ## What changes were proposed in this pull request? add `setFeaturesCol` and `setPredictionCol` for BiKModel and GMModel add `setProbabilityCol` for GMModel ## How was this patch tested? existing tests Author: Zheng RuiFeng Closes #15957 from zhengruifeng/bikm_set. 
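
A small usage sketch for the new model setters (assuming a `spark` session as in `spark-shell`; the toy data and column names are only for illustration):

```scala
import org.apache.spark.ml.clustering.{BisectingKMeans, GaussianMixture}
import org.apache.spark.ml.linalg.Vectors
import spark.implicits._

val data = Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
  Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)
).map(Tuple1.apply).toDF("feats")

// BisectingKMeansModel can now re-point its input/output columns after fitting.
val bkmModel = new BisectingKMeans().setK(2).setFeaturesCol("feats").fit(data)
  .setFeaturesCol("feats")
  .setPredictionCol("bkmPrediction")
bkmModel.transform(data).show()

// GaussianMixtureModel additionally gains setProbabilityCol.
val gmModel = new GaussianMixture().setK(2).setFeaturesCol("feats").fit(data)
  .setFeaturesCol("feats")
  .setPredictionCol("gmPrediction")
  .setProbabilityCol("gmProbability")
gmModel.transform(data).show()
```
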
(cherry picked from commit 2dfabec38c24174e7f747c27c7144f7738483ec1) Signed-off-by: Yanbo Liang --- .../apache/spark/ml/clustering/BisectingKMeans.scala | 8 ++++++++ .../apache/spark/ml/clustering/GaussianMixture.scala | 12 ++++++++++++ 2 files changed, 20 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index e6ca3aedffd9d..cf11ba37abb58 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -98,6 +98,14 @@ class BisectingKMeansModel private[ml] ( copied.setSummary(trainingSummary).setParent(this.parent) } + /** @group setParam */ + @Since("2.1.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + @Since("2.1.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 92d0b7d085f12..19998ca44b115 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -87,6 +87,18 @@ class GaussianMixtureModel private[ml] ( @Since("2.0.0") val gaussians: Array[MultivariateGaussian]) extends Model[GaussianMixtureModel] with GaussianMixtureParams with MLWritable { + /** @group setParam */ + @Since("2.1.0") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + @Since("2.1.0") + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + @Since("2.1.0") + def setProbabilityCol(value: String): this.type = set(probabilityCol, value) + @Since("2.0.0") override def copy(extra: ParamMap): GaussianMixtureModel = { val copied = copyValues(new GaussianMixtureModel(uid, weights, gaussians), extra) From a7f414561325a7140557562d45fecc5ccbc8d7ff Mon Sep 17 00:00:00 2001 From: Nattavut Sutyanyong Date: Thu, 24 Nov 2016 12:07:55 -0800 Subject: [PATCH 188/534] [SPARK-18578][SQL] Full outer join in correlated subquery returns incorrect results ## What changes were proposed in this pull request? - Raise Analysis exception when correlated predicates exist in the descendant operators of either operand of a Full outer join in a subquery as well as in a FOJ operator itself - Raise Analysis exception when correlated predicates exists in a Window operator (a side effect inadvertently introduced by SPARK-17348) ## How was this patch tested? Run sql/test catalyst/test and new test cases, added to SubquerySuite, showing the reported incorrect results. Author: Nattavut Sutyanyong Closes #16005 from nsyca/FOJ-incorrect.1. 
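
As a concrete illustration of the scalar-subquery case (assuming a `spark` session as in `spark-shell`; the tiny views mirror the new test cases added below):

```scala
import org.apache.spark.sql.AnalysisException
import spark.implicits._

Seq(1).toDF("c1").createOrReplaceTempView("t1")
Seq(2).toDF("c1").createOrReplaceTempView("t2")
Seq(1).toDF("c1").createOrReplaceTempView("t3")

try {
  // The correlated predicate t1.c1 = 2 sits below a FULL OUTER JOIN inside the
  // scalar subquery, so analysis now rejects the query instead of returning
  // incorrect results.
  spark.sql(
    """select (select max(1)
      |        from (select c1 from t2 where t1.c1 = 2 and t1.c1 = t2.c1) t2
      |             full join t3
      |             on t2.c1 = t3.c1)
      |from t1""".stripMargin).collect()
} catch {
  case e: AnalysisException => println(s"Rejected during analysis: ${e.getMessage}")
}
```
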
(cherry picked from commit a367d5ff005884322fb8bb43a1cfa4d4bf54b31a) Signed-off-by: Herman van Hovell --- .../sql/catalyst/analysis/Analyzer.scala | 10 +++++ .../org/apache/spark/sql/SubquerySuite.scala | 45 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 2918e9d158829..2d272762b384f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1017,6 +1017,10 @@ class Analyzer( // Simplify the predicates before pulling them out. val transformed = BooleanSimplification(sub) transformUp { + // WARNING: + // Only Filter can host correlated expressions at this time + // Anyone adding a new "case" below needs to add the call to + // "failOnOuterReference" to disallow correlated expressions in it. case f @ Filter(cond, child) => // Find all predicates with an outer reference. val (correlated, local) = splitConjunctivePredicates(cond).partition(containsOuter) @@ -1057,12 +1061,18 @@ class Analyzer( a } case w : Window => + failOnOuterReference(w) failOnNonEqualCorrelatedPredicate(foundNonEqualCorrelatedPred, w) w case j @ Join(left, _, RightOuter, _) => failOnOuterReference(j) failOnOuterReferenceInSubTree(left, "a RIGHT OUTER JOIN") j + // SPARK-18578: Do not allow any correlated predicate + // in a Full (Outer) Join operator and its descendants + case j @ Join(_, _, FullOuter, _) => + failOnOuterReferenceInSubTree(j, "a FULL OUTER JOIN") + j case j @ Join(_, right, jt, _) if !jt.isInstanceOf[InnerLike] => failOnOuterReference(j) failOnOuterReferenceInSubTree(right, "a LEFT (OUTER) JOIN") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index f1dd1c620e660..73a53944964fd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -744,4 +744,49 @@ class SubquerySuite extends QueryTest with SharedSQLContext { } } } + // This restriction applies to + // the permutation of { LOJ, ROJ, FOJ } x { EXISTS, IN, scalar subquery } + // where correlated predicates appears in right operand of LOJ, + // or in left operand of ROJ, or in either operand of FOJ. 
+ // The test cases below cover the representatives of the patterns + test("Correlated subqueries in outer joins") { + withTempView("t1", "t2", "t3") { + Seq(1).toDF("c1").createOrReplaceTempView("t1") + Seq(2).toDF("c1").createOrReplaceTempView("t2") + Seq(1).toDF("c1").createOrReplaceTempView("t3") + + // Left outer join (LOJ) in IN subquery context + intercept[AnalysisException] { + sql( + """ + | select t1.c1 + | from t1 + | where 1 IN (select 1 + | from t3 left outer join + | (select c1 from t2 where t1.c1 = 2) t2 + | on t2.c1 = t3.c1)""".stripMargin).collect() + } + // Right outer join (ROJ) in EXISTS subquery context + intercept[AnalysisException] { + sql( + """ + | select t1.c1 + | from t1 + | where exists (select 1 + | from (select c1 from t2 where t1.c1 = 2) t2 + | right outer join t3 + | on t2.c1 = t3.c1)""".stripMargin).collect() + } + // SPARK-18578: Full outer join (FOJ) in scalar subquery context + intercept[AnalysisException] { + sql( + """ + | select (select max(1) + | from (select c1 from t2 where t1.c1 = 2 and t1.c1=t2.c1) t2 + | full join t3 + | on t2.c1=t3.c1) + | from t1""".stripMargin).collect() + } + } + } } From 57dbc682dfafc87076dcaafd29c637cb16ace91a Mon Sep 17 00:00:00 2001 From: uncleGen Date: Fri, 25 Nov 2016 09:10:17 +0000 Subject: [PATCH 189/534] [SPARK-18575][WEB] Keep same style: adjust the position of driver log links ## What changes were proposed in this pull request? NOT BUG, just adjust the position of driver log link to keep the same style with other executors log link. ![image](https://cloud.githubusercontent.com/assets/7402327/20590092/f8bddbb8-b25b-11e6-9aaf-3b5b3073df10.png) ## How was this patch tested? no Author: uncleGen Closes #16001 from uncleGen/SPARK-18575. (cherry picked from commit f58a8aa20106ea36386db79a8a66f529a8da75c9) Signed-off-by: Sean Owen --- .../spark/scheduler/cluster/YarnClusterSchedulerBackend.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala index ced597bed36d9..4f3d5ebf403e0 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala @@ -55,8 +55,8 @@ private[spark] class YarnClusterSchedulerBackend( val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user" logDebug(s"Base URL for logs: $baseUrl") driverLogs = Some(Map( - "stderr" -> s"$baseUrl/stderr?start=-4096", - "stdout" -> s"$baseUrl/stdout?start=-4096")) + "stdout" -> s"$baseUrl/stdout?start=-4096", + "stderr" -> s"$baseUrl/stderr?start=-4096")) } catch { case e: Exception => logInfo("Error while building AM log links, so AM" + From a49dfa93e160d63e806f35cb6b6953367916f44b Mon Sep 17 00:00:00 2001 From: "n.fraison" Date: Fri, 25 Nov 2016 09:45:51 +0000 Subject: [PATCH 190/534] [SPARK-18119][SPARK-CORE] Namenode safemode check is only performed on one namenode which can stuck the startup of SparkHistory server ## What changes were proposed in this pull request? Instead of using the setSafeMode method that check the first namenode used the one which permitts to check only for active NNs ## How was this patch tested? manual tests Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. This commit is contributed by Criteo SA under the Apache v2 licence. 
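
The change reduces to the overload shown in the sketch below, assuming the caller already holds a `DistributedFileSystem` handle (the helper mirrors the private method touched by the diff):

```scala
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.hadoop.hdfs.protocol.HdfsConstants

// Passing `true` as the second argument asks HDFS to consult only active namenodes,
// so the probe no longer goes to whichever namenode is listed first (possibly a
// standby), which could hang the history server startup.
def isFsInSafeMode(dfs: DistributedFileSystem): Boolean = {
  dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET, true)
}
```
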
Author: n.fraison Closes #15648 from ashangit/SPARK-18119. (cherry picked from commit f42db0c0c1434bfcccaa70d0db55e16c4396af04) Signed-off-by: Sean Owen --- .../org/apache/spark/deploy/history/FsHistoryProvider.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index ca38a47639422..8ef69b142cd15 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -663,9 +663,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) false } - // For testing. private[history] def isFsInSafeMode(dfs: DistributedFileSystem): Boolean = { - dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET) + /* true to check only for Active NNs status */ + dfs.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_GET, true) } /** From 69856f28361022812d2af83128d8591694bcef4b Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 25 Nov 2016 11:27:07 +0000 Subject: [PATCH 191/534] [SPARK-3359][BUILD][DOCS] More changes to resolve javadoc 8 errors that will help unidoc/genjavadoc compatibility ## What changes were proposed in this pull request? This PR only tries to fix things that looks pretty straightforward and were fixed in other previous PRs before. This PR roughly fixes several things as below: - Fix unrecognisable class and method links in javadoc by changing it from `[[..]]` to `` `...` `` ``` [error] .../spark/sql/core/target/java/org/apache/spark/sql/streaming/DataStreamReader.java:226: error: reference not found [error] * Loads text files and returns a {link DataFrame} whose schema starts with a string column named ``` - Fix an exception annotation and remove code backticks in `throws` annotation Currently, sbt unidoc with Java 8 complains as below: ``` [error] .../java/org/apache/spark/sql/streaming/StreamingQuery.java:72: error: unexpected text [error] * throws StreamingQueryException, if this query has terminated with an exception. ``` `throws` should specify the correct class name from `StreamingQueryException,` to `StreamingQueryException` without backticks. (see [JDK-8007644](https://bugs.openjdk.java.net/browse/JDK-8007644)). - Fix `[[http..]]` to ``. ```diff - * [[https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https Oracle - * blog page]]. + * + * Oracle blog page. ``` `[[http...]]` link markdown in scaladoc is unrecognisable in javadoc. - It seems class can't have `return` annotation. So, two cases of this were removed. ``` [error] .../java/org/apache/spark/mllib/regression/IsotonicRegression.java:27: error: invalid use of return [error] * return New instance of IsotonicRegression. ``` - Fix < to `<` and > to `>` according to HTML rules. - Fix `
</p>
    ` complaint - Exclude unrecognisable in javadoc, `constructor`, `todo` and `groupname`. ## How was this patch tested? Manually tested by `jekyll build` with Java 7 and 8 ``` java version "1.7.0_80" Java(TM) SE Runtime Environment (build 1.7.0_80-b15) Java HotSpot(TM) 64-Bit Server VM (build 24.80-b11, mixed mode) ``` ``` java version "1.8.0_45" Java(TM) SE Runtime Environment (build 1.8.0_45-b14) Java HotSpot(TM) 64-Bit Server VM (build 25.45-b02, mixed mode) ``` Note: this does not yet make sbt unidoc suceed with Java 8 yet but it reduces the number of errors with Java 8. Author: hyukjinkwon Closes #15999 from HyukjinKwon/SPARK-3359-errors. (cherry picked from commit 51b1c1551d3a7147403b9e821fcc7c8f57b4824c) Signed-off-by: Sean Owen --- .../scala/org/apache/spark/SSLOptions.scala | 4 +- .../apache/spark/api/java/JavaPairRDD.scala | 6 +- .../org/apache/spark/api/java/JavaRDD.scala | 10 +-- .../spark/api/java/JavaSparkContext.scala | 14 ++-- .../apache/spark/io/CompressionCodec.scala | 2 +- .../main/scala/org/apache/spark/rdd/RDD.scala | 18 ++--- .../spark/security/CryptoStreamUtils.scala | 4 +- .../spark/serializer/KryoSerializer.scala | 3 +- .../storage/BlockReplicationPolicy.scala | 7 +- .../scala/org/apache/spark/ui/UIUtils.scala | 4 +- .../org/apache/spark/util/AccumulatorV2.scala | 2 +- .../org/apache/spark/util/RpcUtils.scala | 2 +- .../org/apache/spark/util/StatCounter.scala | 4 +- .../org/apache/spark/util/ThreadUtils.scala | 6 +- .../scala/org/apache/spark/util/Utils.scala | 10 +-- .../spark/util/io/ChunkedByteBuffer.scala | 2 +- .../scala/org/apache/spark/graphx/Graph.scala | 4 +- .../org/apache/spark/graphx/GraphLoader.scala | 2 +- .../spark/graphx/impl/EdgeRDDImpl.scala | 2 +- .../apache/spark/graphx/lib/PageRank.scala | 4 +- .../apache/spark/graphx/lib/SVDPlusPlus.scala | 3 +- .../spark/graphx/lib/TriangleCount.scala | 2 +- .../distribution/MultivariateGaussian.scala | 3 +- .../scala/org/apache/spark/ml/Predictor.scala | 2 +- .../spark/ml/attribute/AttributeGroup.scala | 2 +- .../spark/ml/attribute/attributes.scala | 4 +- .../classification/LogisticRegression.scala | 74 +++++++++---------- .../MultilayerPerceptronClassifier.scala | 1 - .../spark/ml/classification/NaiveBayes.scala | 8 +- .../RandomForestClassifier.scala | 6 +- .../spark/ml/clustering/BisectingKMeans.scala | 14 ++-- .../ml/clustering/ClusteringSummary.scala | 2 +- .../spark/ml/clustering/GaussianMixture.scala | 6 +- .../apache/spark/ml/clustering/KMeans.scala | 8 +- .../org/apache/spark/ml/clustering/LDA.scala | 42 +++++------ .../org/apache/spark/ml/feature/DCT.scala | 3 +- .../org/apache/spark/ml/feature/MinHash.scala | 5 +- .../spark/ml/feature/MinMaxScaler.scala | 4 +- .../ml/feature/PolynomialExpansion.scala | 14 ++-- .../spark/ml/feature/RandomProjection.scala | 4 +- .../spark/ml/feature/StandardScaler.scala | 4 +- .../spark/ml/feature/StopWordsRemover.scala | 5 +- .../org/apache/spark/ml/feature/package.scala | 3 +- .../IterativelyReweightedLeastSquares.scala | 7 +- .../spark/ml/param/shared/sharedParams.scala | 12 +-- .../ml/regression/AFTSurvivalRegression.scala | 27 +++---- .../ml/regression/DecisionTreeRegressor.scala | 4 +- .../spark/ml/regression/GBTRegressor.scala | 4 +- .../GeneralizedLinearRegression.scala | 12 +-- .../ml/regression/LinearRegression.scala | 38 +++++----- .../ml/regression/RandomForestRegressor.scala | 5 +- .../ml/source/libsvm/LibSVMDataSource.scala | 13 ++-- .../ml/tree/impl/GradientBoostedTrees.scala | 10 +-- .../spark/ml/tree/impl/RandomForest.scala | 2 +- 
.../org/apache/spark/ml/tree/treeParams.scala | 6 +- .../spark/ml/tuning/CrossValidator.scala | 4 +- .../org/apache/spark/ml/util/ReadWrite.scala | 10 +-- .../mllib/classification/NaiveBayes.scala | 28 +++---- .../mllib/clustering/BisectingKMeans.scala | 21 +++--- .../clustering/BisectingKMeansModel.scala | 4 +- .../mllib/clustering/GaussianMixture.scala | 6 +- .../clustering/GaussianMixtureModel.scala | 2 +- .../apache/spark/mllib/clustering/LDA.scala | 24 +++--- .../spark/mllib/clustering/LDAModel.scala | 2 +- .../spark/mllib/clustering/LDAOptimizer.scala | 2 +- .../clustering/PowerIterationClustering.scala | 13 ++-- .../mllib/clustering/StreamingKMeans.scala | 4 +- .../mllib/evaluation/RegressionMetrics.scala | 10 ++- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 12 +-- .../apache/spark/mllib/fpm/PrefixSpan.scala | 7 +- .../linalg/distributed/BlockMatrix.scala | 20 ++--- .../linalg/distributed/CoordinateMatrix.scala | 4 +- .../linalg/distributed/IndexedRowMatrix.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala | 2 +- .../spark/mllib/optimization/Gradient.scala | 24 +++--- .../mllib/optimization/GradientDescent.scala | 4 +- .../spark/mllib/optimization/LBFGS.scala | 7 +- .../spark/mllib/optimization/NNLS.scala | 2 +- .../spark/mllib/optimization/Updater.scala | 6 +- .../org/apache/spark/mllib/package.scala | 4 +- .../apache/spark/mllib/rdd/RDDFunctions.scala | 2 +- .../spark/mllib/recommendation/ALS.scala | 7 +- .../MatrixFactorizationModel.scala | 6 +- .../mllib/regression/IsotonicRegression.scala | 9 +-- .../stat/MultivariateOnlineSummarizer.scala | 7 +- .../apache/spark/mllib/stat/Statistics.scala | 11 +-- .../distribution/MultivariateGaussian.scala | 3 +- .../mllib/tree/GradientBoostedTrees.scala | 2 +- .../spark/mllib/tree/RandomForest.scala | 8 +- .../apache/spark/mllib/tree/model/Split.scala | 2 +- .../org/apache/spark/mllib/util/MLUtils.scala | 10 +-- .../spark/mllib/util/modelSaveLoad.scala | 2 +- pom.xml | 12 +++ project/SparkBuild.scala | 5 +- .../main/scala/org/apache/spark/sql/Row.scala | 2 +- .../aggregate/CentralMomentAgg.scala | 4 +- .../apache/spark/sql/types/BinaryType.scala | 2 +- .../apache/spark/sql/types/BooleanType.scala | 2 +- .../org/apache/spark/sql/types/ByteType.scala | 2 +- .../sql/types/CalendarIntervalType.scala | 2 +- .../org/apache/spark/sql/types/DateType.scala | 2 +- .../apache/spark/sql/types/DecimalType.scala | 4 +- .../apache/spark/sql/types/DoubleType.scala | 2 +- .../apache/spark/sql/types/FloatType.scala | 2 +- .../apache/spark/sql/types/IntegerType.scala | 2 +- .../org/apache/spark/sql/types/LongType.scala | 2 +- .../org/apache/spark/sql/types/MapType.scala | 2 +- .../org/apache/spark/sql/types/NullType.scala | 2 +- .../apache/spark/sql/types/ShortType.scala | 2 +- .../apache/spark/sql/types/StringType.scala | 2 +- .../spark/sql/types/TimestampType.scala | 2 +- .../apache/spark/sql/DataFrameReader.scala | 17 +++-- .../spark/sql/DataFrameStatFunctions.scala | 16 ++-- .../apache/spark/sql/DataFrameWriter.scala | 4 +- .../org/apache/spark/sql/SQLContext.scala | 62 ++++++++-------- .../sql/execution/stat/FrequentItems.scala | 3 +- .../sql/execution/stat/StatFunctions.scala | 4 +- .../spark/sql/expressions/Aggregator.scala | 8 +- .../sql/expressions/UserDefinedFunction.scala | 2 +- .../apache/spark/sql/expressions/Window.scala | 16 ++-- .../spark/sql/expressions/WindowSpec.scala | 16 ++-- .../sql/expressions/scalalang/typed.scala | 2 +- .../apache/spark/sql/expressions/udaf.scala | 24 +++--- .../apache/spark/sql/jdbc/JdbcDialects.scala | 6 
+- .../sql/streaming/DataStreamReader.scala | 20 ++--- .../sql/streaming/DataStreamWriter.scala | 8 +- .../spark/sql/streaming/StreamingQuery.scala | 10 ++- .../sql/streaming/StreamingQueryManager.scala | 8 +- .../sql/util/QueryExecutionListener.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala | 4 +- .../spark/sql/hive/orc/OrcFileFormat.scala | 4 +- .../spark/sql/hive/orc/OrcFileOperator.scala | 2 +- 132 files changed, 558 insertions(+), 499 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SSLOptions.scala b/core/src/main/scala/org/apache/spark/SSLOptions.scala index be19179b00a49..5f14102c3c366 100644 --- a/core/src/main/scala/org/apache/spark/SSLOptions.scala +++ b/core/src/main/scala/org/apache/spark/SSLOptions.scala @@ -150,8 +150,8 @@ private[spark] object SSLOptions extends Logging { * $ - `[ns].enabledAlgorithms` - a comma separated list of ciphers * * For a list of protocols and ciphers supported by particular Java versions, you may go to - * [[https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https Oracle - * blog page]]. + * + * Oracle blog page. * * You can optionally specify the default configuration. If you do, for each setting which is * missing in SparkConf, the corresponding setting is used from the default configuration. diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index bff5a29bb60f1..d7e3a1b1be48c 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -405,7 +405,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * partitioning of the resulting key-value pair RDD by passing a Partitioner. * * @note If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(partitioner: Partitioner): JavaPairRDD[K, JIterable[V]] = @@ -416,7 +416,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * resulting RDD with into `numPartitions` partitions. * * @note If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. */ def groupByKey(numPartitions: Int): JavaPairRDD[K, JIterable[V]] = @@ -546,7 +546,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * resulting RDD with the existing partitioner/parallelism level. * * @note If you are grouping in order to perform an aggregation (such as a sum or average) over - * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]] + * each key, using `JavaPairRDD.reduceByKey` or `JavaPairRDD.combineByKey` * will provide much better performance. 
*/ def groupByKey(): JavaPairRDD[K, JIterable[V]] = diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index ccd94f876e0b8..a20d264be5afd 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -103,10 +103,10 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be >= 0 * * @note This is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. + * of the given `RDD`. */ def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] = sample(withReplacement, fraction, Utils.random.nextLong) @@ -117,11 +117,11 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be >= 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count - * of the given [[RDD]]. + * of the given `RDD`. */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] = wrapRDD(rdd.sample(withReplacement, fraction, seed)) @@ -167,7 +167,7 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) * Return an RDD with the elements from `this` that are not in `other`. * * Uses `this` partitioner/partition size, because even if `other` is huge, the resulting - * RDD will be <= us. + * RDD will be <= us. */ def subtract(other: JavaRDD[T]): JavaRDD[T] = wrapRDD(rdd.subtract(other)) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 38d347aeab8c6..9481156bc93a5 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -238,7 +238,9 @@ class JavaSparkContext(val sc: SparkContext) * }}} * * Do - * `JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * {{{ + * JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") + * }}} * * then `rdd` contains * {{{ @@ -270,7 +272,9 @@ class JavaSparkContext(val sc: SparkContext) * }}} * * Do - * `JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`, + * {{{ + * JavaPairRDD rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path") + * }}}, * * then `rdd` contains * {{{ @@ -749,7 +753,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Get a local property set in this thread, or null if it is missing. See - * [[org.apache.spark.api.java.JavaSparkContext.setLocalProperty]]. + * `org.apache.spark.api.java.JavaSparkContext.setLocalProperty`. 
*/ def getLocalProperty(key: String): String = sc.getLocalProperty(key) @@ -769,7 +773,7 @@ class JavaSparkContext(val sc: SparkContext) * Application programmers can use this method to group all those jobs together and give a * group description. Once set, the Spark web UI will associate such jobs with this group. * - * The application can also use [[org.apache.spark.api.java.JavaSparkContext.cancelJobGroup]] + * The application can also use `org.apache.spark.api.java.JavaSparkContext.cancelJobGroup` * to cancel all running jobs in this group. For example, * {{{ * // In the main thread: @@ -802,7 +806,7 @@ class JavaSparkContext(val sc: SparkContext) /** * Cancel active jobs for the specified group. See - * [[org.apache.spark.api.java.JavaSparkContext.setJobGroup]] for more information. + * `org.apache.spark.api.java.JavaSparkContext.setJobGroup` for more information. */ def cancelJobGroup(groupId: String): Unit = sc.cancelJobGroup(groupId) diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index 6ba79e506a648..2e991ce394c42 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -172,7 +172,7 @@ private final object SnappyCompressionCodec { } /** - * Wrapper over [[SnappyOutputStream]] which guards against write-after-close and double-close + * Wrapper over `SnappyOutputStream` which guards against write-after-close and double-close * issues. See SPARK-7660 for more details. This wrapping can be removed if we upgrade to a version * of snappy-java that contains the fix for https://github.com/xerial/snappy-java/issues/107. */ diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index bff2b8f1d06c9..8e673447581cf 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -70,8 +70,8 @@ import org.apache.spark.util.random.{BernoulliCellSampler, BernoulliSampler, Poi * All of the scheduling and execution in Spark is done based on these methods, allowing each RDD * to implement its own way of computing itself. Indeed, users can implement custom RDDs (e.g. for * reading data from a new storage system) by overriding these functions. Please refer to the - * [[http://people.csail.mit.edu/matei/papers/2012/nsdi_spark.pdf Spark paper]] for more details - * on RDD internals. + * Spark paper + * for more details on RDD internals. */ abstract class RDD[T: ClassTag]( @transient private var _sc: SparkContext, @@ -469,7 +469,7 @@ abstract class RDD[T: ClassTag]( * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] - * with replacement: expected number of times each element is chosen; fraction must be >= 0 + * with replacement: expected number of times each element is chosen; fraction must be >= 0 * @param seed seed for the random number generator * * @note This is NOT guaranteed to provide exactly the fraction of the count @@ -675,8 +675,8 @@ abstract class RDD[T: ClassTag]( * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. 
If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])] = withScope { groupBy[K](f, defaultPartitioner(this)) @@ -688,8 +688,8 @@ abstract class RDD[T: ClassTag]( * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K]( f: T => K, @@ -703,8 +703,8 @@ abstract class RDD[T: ClassTag]( * may even differ each time the resulting RDD is evaluated. * * @note This operation may be very expensive. If you are grouping in order to perform an - * aggregation (such as a sum or average) over each key, using [[PairRDDFunctions.aggregateByKey]] - * or [[PairRDDFunctions.reduceByKey]] will provide much better performance. + * aggregation (such as a sum or average) over each key, using `PairRDDFunctions.aggregateByKey` + * or `PairRDDFunctions.reduceByKey` will provide much better performance. */ def groupBy[K](f: T => K, p: Partitioner)(implicit kt: ClassTag[K], ord: Ordering[K] = null) : RDD[(K, Iterable[T])] = withScope { diff --git a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala index 8f15f50bee814..f41fc38be2080 100644 --- a/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala +++ b/core/src/main/scala/org/apache/spark/security/CryptoStreamUtils.scala @@ -46,7 +46,7 @@ private[spark] object CryptoStreamUtils extends Logging { val COMMONS_CRYPTO_CONF_PREFIX = "commons.crypto." /** - * Helper method to wrap [[OutputStream]] with [[CryptoOutputStream]] for encryption. + * Helper method to wrap `OutputStream` with `CryptoOutputStream` for encryption. */ def createCryptoOutputStream( os: OutputStream, @@ -62,7 +62,7 @@ private[spark] object CryptoStreamUtils extends Logging { } /** - * Helper method to wrap [[InputStream]] with [[CryptoInputStream]] for decryption. + * Helper method to wrap `InputStream` with `CryptoInputStream` for decryption. */ def createCryptoInputStream( is: InputStream, diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 19e020c968a9a..7eb2da1c2748c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -43,7 +43,8 @@ import org.apache.spark.util.{BoundedPriorityQueue, SerializableConfiguration, S import org.apache.spark.util.collection.CompactBuffer /** - * A Spark serializer that uses the [[https://code.google.com/p/kryo/ Kryo serialization library]]. + * A Spark serializer that uses the + * Kryo serialization library. 
* * @note This serializer is not guaranteed to be wire-compatible across different versions of * Spark. It is intended to be used to serialize/de-serialize data within a single diff --git a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala index bf087af16a5b1..bb8a684b4c7a8 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockReplicationPolicy.scala @@ -89,17 +89,18 @@ class RandomBlockReplicationPolicy prioritizedPeers } + // scalastyle:off line.size.limit /** * Uses sampling algorithm by Robert Floyd. Finds a random sample in O(n) while - * minimizing space usage - * [[http://math.stackexchange.com/questions/178690/ - * whats-the-proof-of-correctness-for-robert-floyds-algorithm-for-selecting-a-sin]] + * minimizing space usage. Please see + * here. * * @param n total number of indices * @param m number of samples needed * @param r random number generator * @return list of m random unique indices */ + // scalastyle:on line.size.limit private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = { val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) {case (set, i) => val t = r.nextInt(i) + 1 diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 57f6f2f0a9be5..dbeb970c81dfe 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -422,8 +422,8 @@ private[spark] object UIUtils extends Logging { * the whole string will rendered as a simple escaped text. * * Note: In terms of security, only anchor tags with root relative links are supported. So any - * attempts to embed links outside Spark UI, or other tags like