
Commit dcbb229

dilipbiswal authored and gatorsmile committed
[MINOR][SQL] Only populate type metadata for required types such as CHAR/VARCHAR.
## What changes were proposed in this pull request?

When reading column descriptions from the Hive catalog, we currently populate the metadata for all types to record the raw Hive type string. This additional metadata is only needed for CHAR/VARCHAR types, or for complex types containing CHAR/VARCHAR. It's a minor cleanup; I haven't created a JIRA for it.

## How was this patch tested?

Test added in HiveMetastoreCatalogSuite.

Author: Dilip Biswal <[email protected]>

Closes apache#19215 from dilipbiswal/column_metadata.
1 parent 8be7e6b commit dcbb229
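
The core of the change can be sketched as follows. This is a minimal illustration rather than the exact patch (`columnMetadata` is an invented helper name), assuming `HIVE_TYPE_STRING` is available from `org.apache.spark.sql.types`, as in the test changes below: the raw Hive type string is recorded as metadata only when it differs from the Catalyst type's `catalogString`.

```scala
import org.apache.spark.sql.types._

// Sketch only: keep the raw Hive type string when the Catalyst type cannot
// reproduce it, which is the case for char/varchar and for complex types
// containing them.
def columnMetadata(rawHiveType: String, catalystType: DataType): Metadata = {
  if (rawHiveType != catalystType.catalogString) {
    new MetadataBuilder().putString(HIVE_TYPE_STRING, rawHiveType).build()
  } else {
    Metadata.empty
  }
}

// columnMetadata("char(10)", StringType) // carries HIVE_TYPE_STRING -> "char(10)"
// columnMetadata("bigint", LongType)     // Metadata.empty, nothing to record
```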

File tree

4 files changed: +82 −17

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala

Lines changed: 6 additions & 1 deletion
@@ -849,7 +849,12 @@ private[hive] object HiveClientImpl {
         throw new SparkException("Cannot recognize hive type string: " + hc.getType, e)
     }
 
-    val metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build()
+    val metadata = if (hc.getType != columnType.catalogString) {
+      new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build()
+    } else {
+      Metadata.empty
+    }
+
     val field = StructField(
       name = hc.getName,
       dataType = columnType,

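With this change, consumers that need the original CHAR/VARCHAR declaration can still read it back from the field metadata, falling back to the Catalyst type when nothing was recorded. A minimal sketch (the `rawTypeOf` helper is illustrative, not Spark API):

```scala
import org.apache.spark.sql.types._

// Illustrative only: recover the declared Hive type for a field produced by
// HiveClientImpl. Fields whose type round-trips (int, bigint, string, ...)
// now carry empty metadata, so we fall back to the Catalyst catalogString.
def rawTypeOf(field: StructField): String = {
  if (field.metadata.contains(HIVE_TYPE_STRING)) {
    field.metadata.getString(HIVE_TYPE_STRING) // e.g. "char(10)", "varchar(10)"
  } else {
    field.dataType.catalogString               // e.g. "int", "array<string>"
  }
}
```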
sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala

Lines changed: 69 additions & 1 deletion
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils}
-import org.apache.spark.sql.types.{DecimalType, IntegerType, StringType, StructField, StructType}
+import org.apache.spark.sql.types._
 
 class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils {
   import spark.implicits._
@@ -67,6 +67,73 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils {
       assert(aliases.size == 1)
     }
   }
+
+  test("Validate catalog metadata for supported data types") {
+    withTable("t") {
+      sql(
+        """
+          |CREATE TABLE t (
+          |c1 boolean,
+          |c2 tinyint,
+          |c3 smallint,
+          |c4 short,
+          |c5 bigint,
+          |c6 long,
+          |c7 float,
+          |c8 double,
+          |c9 date,
+          |c10 timestamp,
+          |c11 string,
+          |c12 char(10),
+          |c13 varchar(10),
+          |c14 binary,
+          |c15 decimal,
+          |c16 decimal(10),
+          |c17 decimal(10,2),
+          |c18 array<string>,
+          |c19 array<int>,
+          |c20 array<char(10)>,
+          |c21 map<int,int>,
+          |c22 map<int,char(10)>,
+          |c23 struct<a:int,b:int>,
+          |c24 struct<c:varchar(10),d:int>
+          |)
+        """.stripMargin)
+
+      val schema = hiveClient.getTable("default", "t").schema
+      val expectedSchema = new StructType()
+        .add("c1", "boolean")
+        .add("c2", "tinyint")
+        .add("c3", "smallint")
+        .add("c4", "short")
+        .add("c5", "bigint")
+        .add("c6", "long")
+        .add("c7", "float")
+        .add("c8", "double")
+        .add("c9", "date")
+        .add("c10", "timestamp")
+        .add("c11", "string")
+        .add("c12", "string", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "char(10)").build())
+        .add("c13", "string", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "varchar(10)").build())
+        .add("c14", "binary")
+        .add("c15", "decimal")
+        .add("c16", "decimal(10)")
+        .add("c17", "decimal(10,2)")
+        .add("c18", "array<string>")
+        .add("c19", "array<int>")
+        .add("c20", "array<string>", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "array<char(10)>").build())
+        .add("c21", "map<int,int>")
+        .add("c22", "map<int,string>", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "map<int,char(10)>").build())
+        .add("c23", "struct<a:int,b:int>")
+        .add("c24", "struct<c:string,d:int>", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "struct<c:varchar(10),d:int>").build())
+      assert(schema == expectedSchema)
+    }
+  }
 }
 
 class DataSourceWithHiveMetastoreCatalogSuite
@@ -180,5 +247,6 @@ class DataSourceWithHiveMetastoreCatalogSuite
         }
       }
     }
+
   }
 }

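To see which columns the new code still annotates, one can filter the raw schema on the metadata key. The sketch below assumes the `schema` value from the test above and the wildcard `org.apache.spark.sql.types._` import:

```scala
// Columns whose declared Hive type cannot be reconstructed from the Catalyst
// type, i.e. exactly the ones expected to carry HIVE_TYPE_STRING metadata.
val annotated = schema.fields.collect {
  case f if f.metadata.contains(HIVE_TYPE_STRING) =>
    f.name -> f.metadata.getString(HIVE_TYPE_STRING)
}
// Expected: c12 -> char(10), c13 -> varchar(10), c20 -> array<char(10)>,
//           c22 -> map<int,char(10)>, c24 -> struct<c:varchar(10),d:int>
```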
sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@ class HiveSchemaInferenceSuite
         name = field,
         dataType = LongType,
         nullable = true,
-        metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, "bigint").build())
+        metadata = Metadata.empty)
     }
     // and all partition columns as ints
     val partitionStructFields = partitionCols.map { field =>
@@ -80,7 +80,7 @@ class HiveSchemaInferenceSuite
         name = field.toLowerCase,
         dataType = IntegerType,
         nullable = true,
-        metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, "int").build())
+        metadata = Metadata.empty)
     }
     val schema = StructType(structFields ++ partitionStructFields)

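These expectations drop to `Metadata.empty` because plain `bigint` and `int` columns round-trip exactly through their Catalyst types, so the new condition in `HiveClientImpl` records nothing for them. A quick illustration, as a sketch using the standard type objects:

```scala
import org.apache.spark.sql.types._

// Simple types reproduce the Hive type string exactly, so no metadata is needed.
assert(LongType.catalogString == "bigint")
assert(IntegerType.catalogString == "int")
// char(10)/varchar(10) are read as StringType ("string"), so their original
// declaration must be preserved via HIVE_TYPE_STRING metadata.
assert(StringType.catalogString == "string")
```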
sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala

Lines changed: 5 additions & 13 deletions
@@ -40,14 +40,7 @@ import org.apache.spark.sql.types._
 
 
 class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
-  private def dropMetadata(schema: StructType): StructType = {
-    val newFields = schema.fields.map { f =>
-      StructField(f.name, f.dataType, f.nullable, Metadata.empty)
-    }
-    StructType(newFields)
-  }
-
-  test("Hive serde tables should fallback to HDFS for size estimation") {
+  test("Hive serde tables should fallback to HDFS for size estimation") {
     withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") {
       withTable("csv_table") {
         withTempDir { tempDir =>
@@ -138,9 +131,9 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
 
       // Verify that the schema stored in catalog is a dummy one used for
      // data source tables. The actual schema is stored in table properties.
-      val rawSchema = dropMetadata(hiveClient.getTable("default", table).schema)
-      val expectedRawSchema = new StructType()
-        .add("col", "array<string>")
+      val rawSchema = hiveClient.getTable("default", table).schema
+      val metadata = new MetadataBuilder().putString("comment", "from deserializer").build()
+      val expectedRawSchema = new StructType().add("col", "array<string>", true, metadata)
       assert(rawSchema == expectedRawSchema)
 
       val actualSchema = spark.sharedState.externalCatalog.getTable("default", table).schema
@@ -161,14 +154,13 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
   }
 
   test("Analyze hive serde tables when schema is not same as schema in table properties") {
-
     val table = "hive_serde"
     withTable(table) {
       sql(s"CREATE TABLE $table (C1 INT, C2 STRING, C3 DOUBLE)")
 
       // Verify that the table schema stored in hive catalog is
      // different than the schema stored in table properties.
-      val rawSchema = dropMetadata(hiveClient.getTable("default", table).schema)
+      val rawSchema = hiveClient.getTable("default", table).schema
       val expectedRawSchema = new StructType()
         .add("c1", "int")
         .add("c2", "string")
