Skip to content

Commit 86c2140

Browse files
asl3cloud-fan
authored and committed
[SPARK-51363][SQL] `Desc As JSON` clustering column names
### What changes were proposed in this pull request? Include only column names in the Desc As JSON `clustering_columns` field. Previously, `clustering_information` contained redundant column information, as the clustering columns' full `StructType` definition is already included in the `columns` field of Desc As JSON. ### Why are the changes needed? Clean up the code to ensure consistency, and update the docs to document the `clustering_columns` field (and the similar `partition_columns` field, which was previously undocumented). ### Does this PR introduce _any_ user-facing change? Yes, it updates the output of the SQL command `DESC AS JSON`. ### How was this patch tested? Added a test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #50125 from asl3/asl3/clusteringinfo-test. Authored-by: Amanda Liu <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 4b2ae48 commit 86c2140

File tree

3 files changed

+75
-18
lines changed

3 files changed

+75
-18
lines changed

docs/sql-ref-syntax-aux-describe-table.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,8 @@ to return the metadata pertaining to a partition or column respectively.
9797
"partition_values": {
9898
"<col_name>": "<val>"
9999
},
100+
"partition_columns": ["col1", "col2"],
101+
"clustering_columns": ["col1", "col2"],
100102
"location": "<path>",
101103
"view_text": "<view_text>",
102104
"view_original_text": "<view_original_text>",

sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala

Lines changed: 11 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -239,23 +239,17 @@ case class DescribeRelationJsonCommand(
239239
private def describeClusteringInfoJson(
240240
table: CatalogTable, jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = {
241241
table.clusterBySpec.foreach { clusterBySpec =>
242-
val clusteringColumnsJson: JValue = JArray(
243-
clusterBySpec.columnNames.map { fieldNames =>
244-
val nestedFieldOpt = table.schema.findNestedField(fieldNames.fieldNames.toIndexedSeq)
245-
assert(nestedFieldOpt.isDefined,
246-
"The clustering column " +
247-
s"${fieldNames.fieldNames.map(quoteIfNeeded).mkString(".")} " +
248-
s"was not found in the table schema ${table.schema.catalogString}."
249-
)
250-
val (path, field) = nestedFieldOpt.get
251-
JObject(
252-
"name" -> JString((path :+ field.name).map(quoteIfNeeded).mkString(".")),
253-
"type" -> jsonType(field.dataType),
254-
"comment" -> field.getComment().map(JString).getOrElse(JNull)
255-
)
256-
}.toList
257-
)
258-
addKeyValueToMap("clustering_information", clusteringColumnsJson, jsonMap)
242+
val clusteringColumnsJson = JArray(clusterBySpec.columnNames.map { fieldNames =>
243+
val nestedFieldOpt = table.schema.findNestedField(fieldNames.fieldNames.toIndexedSeq)
244+
assert(nestedFieldOpt.isDefined,
245+
"The clustering column " +
246+
s"${fieldNames.fieldNames.map(quoteIfNeeded).mkString(".")} " +
247+
s"was not found in the table schema ${table.schema.catalogString}."
248+
)
249+
JString(fieldNames.fieldNames.map(quoteIfNeeded).mkString("."))
250+
}.toList)
251+
252+
addKeyValueToMap("clustering_columns", clusteringColumnsJson, jsonMap)
259253
}
260254
}
261255

sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala

Lines changed: 62 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -289,7 +289,8 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
289289
"max_records" -> "1000"
290290
)),
291291
partition_provider = Some("Catalog"),
292-
partition_columns = Some(List("department", "hire_date"))
292+
partition_columns = Some(List("department", "hire_date")),
293+
clustering_columns = None // no cluster spec for "CLUSTERED BY"
293294
)
294295

295296
assert(parsedOutput.location.isDefined)
@@ -364,6 +365,65 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
364365
}
365366
}
366367

368+
test("DESCRIBE AS JSON cluster spec") {
369+
withNamespaceAndTable("ns", "table") { t =>
370+
val tableCreationStr =
371+
s"""
372+
|CREATE TABLE $t (
373+
| id INT,
374+
| name STRING,
375+
| region STRING,
376+
| category STRING
377+
|) USING parquet
378+
|COMMENT 'test cluster spec'
379+
|CLUSTER BY (id, name)
380+
|TBLPROPERTIES ('t' = 'test')
381+
|""".stripMargin
382+
spark.sql(tableCreationStr)
383+
384+
val descriptionDf =
385+
spark.sql(s"DESCRIBE FORMATTED $t AS JSON")
386+
val firstRow = descriptionDf.select("json_metadata").head()
387+
val jsonValue = firstRow.getString(0)
388+
val parsedOutput = parse(jsonValue).extract[DescribeTableJson]
389+
390+
val expectedOutput = DescribeTableJson(
391+
table_name = Some("table"),
392+
catalog_name = Some("spark_catalog"),
393+
namespace = Some(List("ns")),
394+
schema_name = Some("ns"),
395+
columns = Some(List(
396+
TableColumn("id", Type("int"), true),
397+
TableColumn("name", Type("string"), true),
398+
TableColumn("region", Type("string"), true),
399+
TableColumn("category", Type("string"), true)
400+
)),
401+
last_access = Some("UNKNOWN"),
402+
created_by = Some(s"Spark $SPARK_VERSION"),
403+
`type` = Some("MANAGED"),
404+
provider = Some("parquet"),
405+
bucket_columns = Some(Nil),
406+
sort_columns = Some(Nil),
407+
comment = Some("test cluster spec"),
408+
table_properties = Some(Map(
409+
"t" -> "test",
410+
"clusteringColumns" -> "[[\"id\"],[\"name\"]]"
411+
)),
412+
serde_library = if (getProvider() == "hive") {
413+
Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
414+
} else {
415+
None
416+
},
417+
clustering_columns = Some(List("id", "name"))
418+
)
419+
420+
assert(parsedOutput.location.isDefined)
421+
assert(iso8601Regex.matches(parsedOutput.created_time.get))
422+
assert(expectedOutput == parsedOutput.copy(
423+
location = None, created_time = None, storage_properties = None))
424+
}
425+
}
426+
367427
test("DESCRIBE AS JSON default values") {
368428
withNamespaceAndTable("ns", "table") { t =>
369429
val tableCreationStr =
@@ -756,6 +816,7 @@ case class DescribeTableJson(
756816
partition_provider: Option[String] = None,
757817
partition_columns: Option[List[String]] = Some(Nil),
758818
partition_values: Option[Map[String, String]] = None,
819+
clustering_columns: Option[List[String]] = None,
759820
statistics: Option[Map[String, Any]] = None,
760821
view_text: Option[String] = None,
761822
view_original_text: Option[String] = None,

0 commit comments

Comments
 (0)