Skip to content

Commit 308996b

Browse files
gengliangwang authored and cloud-fan committed
[SPARK-26716][SPARK-26765][FOLLOWUP][SQL] Clean up schema validation methods and override toString method in Avro
## What changes were proposed in this pull request? In apache#23639, the API `supportDataType` was refactored. We should also remove the methods `verifyWriteSchema` and `verifyReadSchema` in `DataSourceUtils`. Since the error message uses `FileFormat.toString` to specify the data source name, this PR also overrides the `toString` method in `AvroFileFormat`. ## How was this patch tested? Unit test. Closes apache#23699 from gengliangwang/SPARK-26716-followup. Authored-by: Gengliang Wang <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent d4d6df2 commit 308996b

File tree

5 files changed

+6
-19
lines changed

5 files changed

+6
-19
lines changed

external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ private[avro] class AvroFileFormat extends FileFormat
124124

125125
override def shortName(): String = "avro"
126126

127+
override def toString(): String = "Avro"
128+
127129
override def isSplitable(
128130
sparkSession: SparkSession,
129131
options: Map[String, String],

external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -896,7 +896,7 @@ class AvroSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
896896
sql("select testType()").write.format("avro").mode("overwrite").save(tempDir)
897897
}.getMessage
898898
assert(msg.toLowerCase(Locale.ROOT)
899-
.contains(s"data source does not support calendarinterval data type."))
899+
.contains(s"avro data source does not support calendarinterval data type."))
900900
}
901901
}
902902

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ case class DataSource(
412412
hs.partitionSchema.map(_.name),
413413
"in the partition schema",
414414
equality)
415-
DataSourceUtils.verifyReadSchema(hs.fileFormat, hs.dataSchema)
415+
DataSourceUtils.verifySchema(hs.fileFormat, hs.dataSchema)
416416
case _ =>
417417
SchemaUtils.checkColumnNameDuplication(
418418
relation.schema.map(_.name),

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,26 +24,11 @@ import org.apache.spark.sql.types._
2424

2525

2626
object DataSourceUtils {
27-
28-
/**
29-
* Verify if the schema is supported in datasource in write path.
30-
*/
31-
def verifyWriteSchema(format: FileFormat, schema: StructType): Unit = {
32-
verifySchema(format, schema, isReadPath = false)
33-
}
34-
35-
/**
36-
* Verify if the schema is supported in datasource in read path.
37-
*/
38-
def verifyReadSchema(format: FileFormat, schema: StructType): Unit = {
39-
verifySchema(format, schema, isReadPath = true)
40-
}
41-
4227
/**
4328
* Verify if the schema is supported in datasource. This verification should be done
4429
* in a driver side.
4530
*/
46-
private def verifySchema(format: FileFormat, schema: StructType, isReadPath: Boolean): Unit = {
31+
def verifySchema(format: FileFormat, schema: StructType): Unit = {
4732
schema.foreach { field =>
4833
if (!format.supportDataType(field.dataType)) {
4934
throw new AnalysisException(

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ object FileFormatWriter extends Logging {
9898
val caseInsensitiveOptions = CaseInsensitiveMap(options)
9999

100100
val dataSchema = dataColumns.toStructType
101-
DataSourceUtils.verifyWriteSchema(fileFormat, dataSchema)
101+
DataSourceUtils.verifySchema(fileFormat, dataSchema)
102102
// Note: prepareWrite has side effect. It sets "job".
103103
val outputWriterFactory =
104104
fileFormat.prepareWrite(sparkSession, job, caseInsensitiveOptions, dataSchema)

0 commit comments

Comments (0)