Skip to content

Commit 513a080

Browse files
asl3 authored and cloud-fan committed
[SPARK-51525][SQL] Collation field for Desc As JSON StringType
### What changes were proposed in this pull request?

Add a collation field for Desc As JSON StringType. For example:

```
"columns":[{"name":"c1","type":{"name":"string", "collation":"UNICODE_CI"}
```

or the default collation value:

```
"columns":[{"name":"c1","type":{"name":"string", "collation":"UTF8_BINARY"}
```

### Why are the changes needed?

Add support for collation data type in Desc As JSON

### Does this PR introduce _any_ user-facing change?

Yes, it affects the output of Desc As JSON for collation data type.

### How was this patch tested?

Added test in DescribeTableSuite

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #50290 from asl3/asl3/collation-descasjson.

Authored-by: Amanda Liu <amanda.liu@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent ecc6595 commit 513a080

File tree

3 files changed

+82
-21
lines changed

3 files changed

+82
-21
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,12 @@ case class DescribeRelationJsonCommand(
223223
"end_unit" -> JString(getFieldName(dayTimeIntervalType.endField))
224224
)
225225

226+
case stringType: StringType =>
227+
JObject(
228+
"name" -> JString("string"),
229+
"collation" -> JString(stringType.collationName)
230+
)
231+
226232
case _ =>
227233
JObject("name" -> JString(dataType.simpleString))
228234
}

sql/core/src/test/resources/sql-tests/results/describe.sql.out

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ DESCRIBE EXTENDED t AS JSON
7676
-- !query schema
7777
struct<json_metadata:string>
7878
-- !query output
79-
{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not included in comparison]/{warehouse_dir}/t","storage_properties":{"a":"1","b":"2","password":"*********(redacted)"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"parquet","comment":"table_comment","table_properties":{"e":"3","password":"*********(redacted)","t":"test"},"partition_provider":"Catalog","partition_columns":["c","d"]}
79+
{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"d","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}],"num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"location":"file:[not included in comparison]/{warehouse_dir}/t","storage_properties":{"a":"1","b":"2","password":"*********(redacted)"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"parquet","comment":"table_comment","table_properties":{"e":"3","password":"*********(redacted)","t":"test"},"partition_provider":"Catalog","partition_columns":["c","d"]}
8080

8181

8282
-- !query
@@ -303,7 +303,7 @@ DESC EXTENDED t PARTITION (c='Us', d=1) AS JSON
303303
-- !query schema
304304
struct<json_metadata:string>
305305
-- !query output
306-
{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string"},"nullable":true},{"name":"d","type":{"name":"string"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not included in comparison]/{warehouse_dir}/t/c=Us/d=1","storage_properties":{"a":"1","b":"2","password":"*********(redacted)"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"parquet","num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"table_properties":{"password":"*********(redacted)","t":"test"},"partition_provider":"Catalog","partition_columns":["c","d"]}
306+
{"table_name":"t","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"a","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"b","type":{"name":"int"},"nullable":true},{"name":"c","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"d","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true}],"partition_values":{"c":"Us","d":"1"},"location":"file:[not included in comparison]/{warehouse_dir}/t/c=Us/d=1","storage_properties":{"a":"1","b":"2","password":"*********(redacted)"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"parquet","num_buckets":2,"bucket_columns":["a"],"sort_columns":["b"],"table_properties":{"password":"*********(redacted)","t":"test"},"partition_provider":"Catalog","partition_columns":["c","d"]}
307307

308308

309309
-- !query
@@ -929,7 +929,7 @@ DESC TABLE EXTENDED f PARTITION (B='SPARK', C=TIMESTAMP'2018-11-17 13:33:33') AS
929929
-- !query schema
930930
struct<json_metadata:string>
931931
-- !query output
932-
{"table_name":"f","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"A","type":{"name":"string"},"nullable":true},{"name":"B","type":{"name":"binary"},"nullable":true},{"name":"C","type":{"name":"timestamp_ltz"},"nullable":true}],"partition_values":{"B":"SPARK","C":"2018-11-17 13:33:33"},"location":"file:[not included in comparison]/{warehouse_dir}/f/B=SPARK/C=2018-11-17 13%3A33%3A33","partition_parameters":{"numFiles":"1","totalSize":"15","transient_lastDdlTime [not included in comparison]":"None"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"json","partition_provider":"Catalog","partition_columns":["B","C"]}
932+
{"table_name":"f","catalog_name":"spark_catalog","namespace":["default"],"schema_name":"default","columns":[{"name":"A","type":{"name":"string","collation":"UTF8_BINARY"},"nullable":true},{"name":"B","type":{"name":"binary"},"nullable":true},{"name":"C","type":{"name":"timestamp_ltz"},"nullable":true}],"partition_values":{"B":"SPARK","C":"2018-11-17 13:33:33"},"location":"file:[not included in comparison]/{warehouse_dir}/f/B=SPARK/C=2018-11-17 13%3A33%3A33","partition_parameters":{"numFiles":"1","totalSize":"15","transient_lastDdlTime [not included in comparison]":"None"},"created_time [not included in comparison]":"None","last_access [not included in comparison]":"None","created_by [not included in comparison]":"None","type":"MANAGED","provider":"json","partition_provider":"Catalog","partition_columns":["B","C"]}
933933

934934

935935
-- !query

sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala

Lines changed: 73 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -265,8 +265,8 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
265265
schema_name = Some("ns"),
266266
columns = Some(List(
267267
TableColumn("employee_id", Type("int"), true),
268-
TableColumn("employee_name", Type("string"), true),
269-
TableColumn("department", Type("string"), true),
268+
TableColumn("employee_name", Type("string", collation = Some("UTF8_BINARY")), true),
269+
TableColumn("department", Type("string", collation = Some("UTF8_BINARY")), true),
270270
TableColumn("hire_date", Type("date"), true)
271271
)),
272272
last_access = Some("UNKNOWN"),
@@ -330,9 +330,9 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
330330
schema_name = Some("ns"),
331331
columns = Some(List(
332332
TableColumn("id", Type("int"), true),
333-
TableColumn("name", Type("string"), true),
334-
TableColumn("region", Type("string"), true),
335-
TableColumn("category", Type("string"), true)
333+
TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), true),
334+
TableColumn("region", Type("string", collation = Some("UTF8_BINARY")), true),
335+
TableColumn("category", Type("string", collation = Some("UTF8_BINARY")), true)
336336
)),
337337
last_access = Some("UNKNOWN"),
338338
created_by = Some(s"Spark $SPARK_VERSION"),
@@ -394,9 +394,9 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
394394
schema_name = Some("ns"),
395395
columns = Some(List(
396396
TableColumn("id", Type("int"), true),
397-
TableColumn("name", Type("string"), true),
398-
TableColumn("region", Type("string"), true),
399-
TableColumn("category", Type("string"), true)
397+
TableColumn("name", Type("string", collation = Some("UTF8_BINARY")), true),
398+
TableColumn("region", Type("string", collation = Some("UTF8_BINARY")), true),
399+
TableColumn("category", Type("string", collation = Some("UTF8_BINARY")), true)
400400
)),
401401
last_access = Some("UNKNOWN"),
402402
created_by = Some(s"Spark $SPARK_VERSION"),
@@ -424,6 +424,58 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
424424
}
425425
}
426426

427+
test("DESCRIBE AS JSON collation") {
428+
withNamespaceAndTable("ns", "table") { t =>
429+
val tableCreationStr =
430+
s"""
431+
|CREATE TABLE $t (
432+
| c1 STRING COLLATE UNICODE_CI,
433+
| c2 STRING COLLATE UNICODE_RTRIM,
434+
| c3 STRING COLLATE FR,
435+
| c4 STRING,
436+
| id INT
437+
|)
438+
|USING parquet COMMENT 'table_comment'
439+
|""".stripMargin
440+
spark.sql(tableCreationStr)
441+
442+
val descriptionDf = spark.sql(s"DESC EXTENDED $t AS JSON")
443+
val firstRow = descriptionDf.select("json_metadata").head()
444+
val jsonValue = firstRow.getString(0)
445+
val parsedOutput = parse(jsonValue).extract[DescribeTableJson]
446+
447+
val expectedOutput = DescribeTableJson(
448+
table_name = Some("table"),
449+
catalog_name = Some("spark_catalog"),
450+
namespace = Some(List("ns")),
451+
schema_name = Some("ns"),
452+
columns = Some(List(
453+
TableColumn("c1", Type("string", collation = Some("UNICODE_CI"))),
454+
TableColumn("c2", Type("string", collation = Some("UNICODE_RTRIM"))),
455+
TableColumn("c3", Type("string", collation = Some("fr"))),
456+
TableColumn("c4", Type("string", collation = Some("UTF8_BINARY"))),
457+
TableColumn("id", Type("int")))),
458+
last_access = Some("UNKNOWN"),
459+
created_by = Some(s"Spark $SPARK_VERSION"),
460+
`type` = Some("MANAGED"),
461+
storage_properties = None,
462+
provider = Some("parquet"),
463+
bucket_columns = Some(Nil),
464+
sort_columns = Some(Nil),
465+
comment = Some("table_comment"),
466+
serde_library = if (getProvider() == "hive") {
467+
Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
468+
} else {
469+
None
470+
},
471+
table_properties = None
472+
)
473+
assert(parsedOutput.location.isDefined)
474+
assert(iso8601Regex.matches(parsedOutput.created_time.get))
475+
assert(expectedOutput == parsedOutput.copy(location = None, created_time = None))
476+
}
477+
}
478+
427479
test("DESCRIBE AS JSON default values") {
428480
withNamespaceAndTable("ns", "table") { t =>
429481
val tableCreationStr =
@@ -450,7 +502,8 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
450502
schema_name = Some("ns"),
451503
columns = Some(List(
452504
TableColumn("id", Type("int"), default = Some("1")),
453-
TableColumn("name", Type("string"), default = Some("'unknown'")),
505+
TableColumn("name", Type("string", collation = Some("UTF8_BINARY")),
506+
default = Some("'unknown'")),
454507
TableColumn("created_at", Type("timestamp_ltz"), default = Some("CURRENT_TIMESTAMP")),
455508
TableColumn("is_active", Type("boolean"), default = Some("true"))
456509
)),
@@ -503,7 +556,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
503556
schema_name = if (isTemp) Some("session") else Some("default"),
504557
columns = Some(List(
505558
TableColumn("id", Type("int")),
506-
TableColumn("name", Type("string")),
559+
TableColumn("name", Type("string", collation = Some("UTF8_BINARY"))),
507560
TableColumn("created_at", Type("timestamp_ltz"))
508561
)),
509562
last_access = Some("UNKNOWN"),
@@ -603,7 +656,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
603656
fields = Some(List(
604657
Field(
605658
name = "name",
606-
`type` = Type("string")
659+
`type` = Type("string", collation = Some("UTF8_BINARY"))
607660
),
608661
Field(
609662
name = "age",
@@ -616,13 +669,13 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
616669
fields = Some(List(
617670
Field(
618671
name = "email",
619-
`type` = Type("string")
672+
`type` = Type("string", collation = Some("UTF8_BINARY"))
620673
),
621674
Field(
622675
name = "phone_numbers",
623676
`type` = Type(
624677
name = "array",
625-
element_type = Some(Type("string")),
678+
element_type = Some(Type("string", collation = Some("UTF8_BINARY"))),
626679
element_nullable = Some(true)
627680
)
628681
),
@@ -635,11 +688,11 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
635688
fields = Some(List(
636689
Field(
637690
name = "street",
638-
`type` = Type("string")
691+
`type` = Type("string", collation = Some("UTF8_BINARY"))
639692
),
640693
Field(
641694
name = "city",
642-
`type` = Type("string")
695+
`type` = Type("string", collation = Some("UTF8_BINARY"))
643696
),
644697
Field(
645698
name = "zip",
@@ -661,10 +714,10 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
661714
name = "preferences",
662715
`type` = Type(
663716
name = "map",
664-
key_type = Some(Type("string")),
717+
key_type = Some(Type("string", collation = Some("UTF8_BINARY"))),
665718
value_type = Some(Type(
666719
name = "array",
667-
element_type = Some(Type("string")),
720+
element_type = Some(Type("string", collation = Some("UTF8_BINARY"))),
668721
element_nullable = Some(true)
669722
)),
670723
value_nullable = Some(true)
@@ -673,7 +726,7 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
673726
),
674727
TableColumn(
675728
name = "id",
676-
`type` = Type("string"),
729+
`type` = Type("string", collation = Some("UTF8_BINARY")),
677730
default = None
678731
)
679732
)),
@@ -836,6 +889,8 @@ case class TableColumn(
836889

837890
case class Type(
838891
name: String,
892+
collation: Option[String] = None,
893+
length: Option[Int] = None,
839894
fields: Option[List[Field]] = None,
840895
`type`: Option[Type] = None,
841896
element_type: Option[Type] = None,

0 commit comments

Comments
 (0)