
Commit dcbb229

dilipbiswal authored and gatorsmile committed
[MINOR][SQL] Only populate type metadata for required types such as CHAR/VARCHAR.
## What changes were proposed in this pull request?

When reading column descriptions from the Hive catalog, we currently populate the metadata for all types to record the raw Hive type string. This additional metadata is only needed for CHAR/VARCHAR types, or for complex types containing CHAR/VARCHAR. It's a minor cleanup; I haven't created a JIRA for it.

## How was this patch tested?

Test added in HiveMetastoreCatalogSuite.

Author: Dilip Biswal <[email protected]>

Closes apache#19215 from dilipbiswal/column_metadata.
1 parent 8be7e6b commit dcbb229
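
The core of the change can be sketched as follows. This is a minimal illustration rather than the exact patch (`columnMetadata` is an invented helper name), assuming `HIVE_TYPE_STRING` is available from `org.apache.spark.sql.types`, as in the test changes below: the raw Hive type string is recorded as metadata only when it differs from the Catalyst type's `catalogString`.

```scala
import org.apache.spark.sql.types._

// Sketch only: keep the raw Hive type string when the Catalyst type cannot
// reproduce it, which is the case for char/varchar and for complex types
// containing them.
def columnMetadata(rawHiveType: String, catalystType: DataType): Metadata = {
  if (rawHiveType != catalystType.catalogString) {
    new MetadataBuilder().putString(HIVE_TYPE_STRING, rawHiveType).build()
  } else {
    Metadata.empty
  }
}

// columnMetadata("char(10)", StringType) // carries HIVE_TYPE_STRING -> "char(10)"
// columnMetadata("bigint", LongType)     // Metadata.empty, nothing to record
```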

File tree

4 files changed: +82 −17

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala

Lines changed: 6 additions & 1 deletion
@@ -849,7 +849,12 @@ private[hive] object HiveClientImpl {
         throw new SparkException("Cannot recognize hive type string: " + hc.getType, e)
     }
 
-    val metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build()
+    val metadata = if (hc.getType != columnType.catalogString) {
+      new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build()
+    } else {
+      Metadata.empty
+    }
+
     val field = StructField(
       name = hc.getName,
       dataType = columnType,

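With this change, consumers that need the original CHAR/VARCHAR declaration can still read it back from the field metadata, falling back to the Catalyst type when nothing was recorded. A minimal sketch (the `rawTypeOf` helper is illustrative, not Spark API):

```scala
import org.apache.spark.sql.types._

// Illustrative only: recover the declared Hive type for a field produced by
// HiveClientImpl. Fields whose type round-trips (int, bigint, string, ...)
// now carry empty metadata, so we fall back to the Catalyst catalogString.
def rawTypeOf(field: StructField): String = {
  if (field.metadata.contains(HIVE_TYPE_STRING)) {
    field.metadata.getString(HIVE_TYPE_STRING) // e.g. "char(10)", "varchar(10)"
  } else {
    field.dataType.catalogString               // e.g. "int", "array<string>"
  }
}
```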
sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala

Lines changed: 69 additions & 1 deletion
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils}
-import org.apache.spark.sql.types.{DecimalType, IntegerType, StringType, StructField, StructType}
+import org.apache.spark.sql.types._
 
 class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils {
   import spark.implicits._
@@ -67,6 +67,73 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils {
       assert(aliases.size == 1)
     }
   }
+
+  test("Validate catalog metadata for supported data types") {
+    withTable("t") {
+      sql(
+        """
+          |CREATE TABLE t (
+          |c1 boolean,
+          |c2 tinyint,
+          |c3 smallint,
+          |c4 short,
+          |c5 bigint,
+          |c6 long,
+          |c7 float,
+          |c8 double,
+          |c9 date,
+          |c10 timestamp,
+          |c11 string,
+          |c12 char(10),
+          |c13 varchar(10),
+          |c14 binary,
+          |c15 decimal,
+          |c16 decimal(10),
+          |c17 decimal(10,2),
+          |c18 array<string>,
+          |c19 array<int>,
+          |c20 array<char(10)>,
+          |c21 map<int,int>,
+          |c22 map<int,char(10)>,
+          |c23 struct<a:int,b:int>,
+          |c24 struct<c:varchar(10),d:int>
+          |)
+        """.stripMargin)
+
+      val schema = hiveClient.getTable("default", "t").schema
+      val expectedSchema = new StructType()
+        .add("c1", "boolean")
+        .add("c2", "tinyint")
+        .add("c3", "smallint")
+        .add("c4", "short")
+        .add("c5", "bigint")
+        .add("c6", "long")
+        .add("c7", "float")
+        .add("c8", "double")
+        .add("c9", "date")
+        .add("c10", "timestamp")
+        .add("c11", "string")
+        .add("c12", "string", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "char(10)").build())
+        .add("c13", "string", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "varchar(10)").build())
+        .add("c14", "binary")
+        .add("c15", "decimal")
+        .add("c16", "decimal(10)")
+        .add("c17", "decimal(10,2)")
+        .add("c18", "array<string>")
+        .add("c19", "array<int>")
+        .add("c20", "array<string>", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "array<char(10)>").build())
+        .add("c21", "map<int,int>")
+        .add("c22", "map<int,string>", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "map<int,char(10)>").build())
+        .add("c23", "struct<a:int,b:int>")
+        .add("c24", "struct<c:string,d:int>", true,
+          new MetadataBuilder().putString(HIVE_TYPE_STRING, "struct<c:varchar(10),d:int>").build())
+      assert(schema == expectedSchema)
+    }
+  }
 }
 
 class DataSourceWithHiveMetastoreCatalogSuite
@@ -180,5 +247,6 @@ class DataSourceWithHiveMetastoreCatalogSuite
         }
       }
     }
+
   }
 }

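To see which columns the new code still annotates, one can filter the raw schema on the metadata key. The sketch below assumes the `schema` value from the test above and the wildcard `org.apache.spark.sql.types._` import:

```scala
// Columns whose declared Hive type cannot be reconstructed from the Catalyst
// type, i.e. exactly the ones expected to carry HIVE_TYPE_STRING metadata.
val annotated = schema.fields.collect {
  case f if f.metadata.contains(HIVE_TYPE_STRING) =>
    f.name -> f.metadata.getString(HIVE_TYPE_STRING)
}
// Expected: c12 -> char(10), c13 -> varchar(10), c20 -> array<char(10)>,
//           c22 -> map<int,char(10)>, c24 -> struct<c:varchar(10),d:int>
```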
sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@ class HiveSchemaInferenceSuite
         name = field,
         dataType = LongType,
         nullable = true,
-        metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, "bigint").build())
+        metadata = Metadata.empty)
     }
     // and all partition columns as ints
     val partitionStructFields = partitionCols.map { field =>
@@ -80,7 +80,7 @@ class HiveSchemaInferenceSuite
         name = field.toLowerCase,
         dataType = IntegerType,
         nullable = true,
-        metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, "int").build())
+        metadata = Metadata.empty)
     }
     val schema = StructType(structFields ++ partitionStructFields)

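These expectations drop to `Metadata.empty` because plain `bigint` and `int` columns round-trip exactly through their Catalyst types, so the new condition in `HiveClientImpl` records nothing for them. A quick illustration, as a sketch using the standard type objects:

```scala
import org.apache.spark.sql.types._

// Simple types reproduce the Hive type string exactly, so no metadata is needed.
assert(LongType.catalogString == "bigint")
assert(IntegerType.catalogString == "int")
// char(10)/varchar(10) are read as StringType ("string"), so their original
// declaration must be preserved via HIVE_TYPE_STRING metadata.
assert(StringType.catalogString == "string")
```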
sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala

Lines changed: 5 additions & 13 deletions
@@ -40,14 +40,7 @@ import org.apache.spark.sql.types._
 
 
 class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
-  private def dropMetadata(schema: StructType): StructType = {
-    val newFields = schema.fields.map { f =>
-      StructField(f.name, f.dataType, f.nullable, Metadata.empty)
-    }
-    StructType(newFields)
-  }
-
-  test("Hive serde tables should fallback to HDFS for size estimation") {
+  test("Hive serde tables should fallback to HDFS for size estimation") {
     withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") {
       withTable("csv_table") {
         withTempDir { tempDir =>
@@ -138,9 +131,9 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
 
       // Verify that the schema stored in catalog is a dummy one used for
      // data source tables. The actual schema is stored in table properties.
-      val rawSchema = dropMetadata(hiveClient.getTable("default", table).schema)
-      val expectedRawSchema = new StructType()
-        .add("col", "array<string>")
+      val rawSchema = hiveClient.getTable("default", table).schema
+      val metadata = new MetadataBuilder().putString("comment", "from deserializer").build()
+      val expectedRawSchema = new StructType().add("col", "array<string>", true, metadata)
       assert(rawSchema == expectedRawSchema)
 
       val actualSchema = spark.sharedState.externalCatalog.getTable("default", table).schema
@@ -161,14 +154,13 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
   }
 
   test("Analyze hive serde tables when schema is not same as schema in table properties") {
-
     val table = "hive_serde"
     withTable(table) {
       sql(s"CREATE TABLE $table (C1 INT, C2 STRING, C3 DOUBLE)")
 
       // Verify that the table schema stored in hive catalog is
      // different than the schema stored in table properties.
-      val rawSchema = dropMetadata(hiveClient.getTable("default", table).schema)
+      val rawSchema = hiveClient.getTable("default", table).schema
       val expectedRawSchema = new StructType()
         .add("c1", "int")
         .add("c2", "string")
