
Commit 8b98c58

Merge pull request apache-spark-on-k8s#426 from palantir/bd/SPARK-25700
[SPARK-25700][SQL] Creates ReadSupport in only Append Mode in Data So…
2 parents: 8882973 + 52fb182

3 files changed (+32 / -4 lines)

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -246,8 +246,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
         df.sparkSession.sessionState.conf)
       val options = sessionOptions ++ extraOptions
 
-      val relation = DataSourceV2Relation.create(source, options)
       if (mode == SaveMode.Append) {
+        val relation = DataSourceV2Relation.create(source, options)
         runCommand(df.sparkSession, "save") {
           AppendData.byName(relation, df.logicalPlan)
         }
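Creating the DataSourceV2Relation is what triggers a schema read on the source, so hoisting it into the SaveMode.Append branch keeps the other save modes off the read path entirely. A minimal sketch of the user-visible effect, assuming a hypothetical write-only source registered as com.example.WriteOnlySource whose schema lookup throws:

val df = spark.range(5).select('id as 'i, -'id as 'j)

// Non-append modes no longer construct a DataSourceV2Relation,
// so the source's schema is never read (hypothetical source name):
df.write.format("com.example.WriteOnlySource").mode("overwrite").save()

// Append mode still has to resolve the existing schema for
// AppendData.byName, so it creates the relation as before:
df.write.format("com.example.WriteOnlySource").mode("append").save()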

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala

Lines changed: 29 additions & 0 deletions
@@ -351,6 +351,24 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  test("SPARK-25700: do not read schema when writing in other modes except append mode") {
+    withTempPath { file =>
+      val cls = classOf[SimpleWriteOnlyDataSource]
+      val path = file.getCanonicalPath
+      val df = spark.range(5).select('id as 'i, -'id as 'j)
+      try {
+        df.write.format(cls.getName).option("path", path).mode("error").save()
+        df.write.format(cls.getName).option("path", path).mode("overwrite").save()
+        df.write.format(cls.getName).option("path", path).mode("ignore").save()
+      } catch {
+        case e: SchemaReadAttemptException => fail("Schema read was attempted.", e)
+      }
+      intercept[SchemaReadAttemptException] {
+        df.write.format(cls.getName).option("path", path).mode("append").save()
+      }
+    }
+  }
 }
@@ -640,3 +658,14 @@ object SpecificReaderFactory extends PartitionReaderFactory {
     }
   }
 }
+
+class SchemaReadAttemptException(m: String) extends RuntimeException(m)
+
+class SimpleWriteOnlyDataSource extends SimpleWritableDataSource {
+  override def fullSchema(): StructType = {
+    // This is a bit hacky since this source implements read support but throws
+    // during schema retrieval. It is written this way to keep the change minimal.
+    throw new SchemaReadAttemptException("read is not supported")
+  }
+}
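Because SimpleWriteOnlyDataSource throws from fullSchema(), any code path that resolves the source's schema surfaces a SchemaReadAttemptException, which is what the test asserts on for append mode. A hedged sketch of the same probe firing on the read side (session and path setup assumed, not part of the patch):

// Loading this source for a read also resolves fullSchema(), so the
// sentinel exception should fire there as well:
intercept[SchemaReadAttemptException] {
  spark.read.format(classOf[SimpleWriteOnlyDataSource].getName).load(path)
}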

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala

Lines changed: 2 additions & 3 deletions
@@ -43,13 +43,13 @@ class SimpleWritableDataSource extends DataSourceV2
   with BatchWriteSupportProvider
   with SessionConfigSupport {
 
-  private val schema = new StructType().add("i", "long").add("j", "long")
+  protected def fullSchema(): StructType = new StructType().add("i", "long").add("j", "long")
 
   override def keyPrefix: String = "simpleWritableDataSource"
 
   class ReadSupport(path: String, conf: Configuration) extends SimpleReadSupport {
 
-    override def fullSchema(): StructType = schema
+    override def fullSchema(): StructType = SimpleWritableDataSource.this.fullSchema()
 
     override def planInputPartitions(config: ScanConfig): Array[InputPartition] = {
       val dataPath = new Path(path)
@@ -116,7 +116,6 @@ class SimpleWritableDataSource extends DataSourceV2
       schema: StructType,
       mode: SaveMode,
       options: DataSourceOptions): Optional[BatchWriteSupport] = {
-    assert(DataType.equalsStructurally(schema.asNullable, this.schema.asNullable))
     assert(!SparkContext.getActive.get.conf.getBoolean("spark.speculation", false))
 
     val path = new Path(options.get("path").get())
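Two things happen here: the schema becomes an overridable hook (a protected def instead of a private val), and the structural-equality assert in the write path is dropped, since this.schema no longer exists and calling fullSchema() would throw in the write-only test subclass. A minimal sketch of the hook pattern in isolation, with illustrative class names that are not part of the patch:

import org.apache.spark.sql.types.StructType

// Illustrative only: the base source exposes its schema through an
// overridable hook...
class FixedSchemaSource {
  protected def fullSchema(): StructType =
    new StructType().add("i", "long").add("j", "long")
}

// ...so a subclass can replace or instrument the schema without touching
// any of the read or write logic.
class ThrowingSchemaSource extends FixedSchemaSource {
  override def fullSchema(): StructType =
    throw new IllegalStateException("schema should not be read")
}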
