Skip to content

Commit 86c2140

Browse files
asl3cloud-fan
authored and committed
[SPARK-51363][SQL] `Desc As JSON` clustering column names
### What changes were proposed in this pull request? Include only column names in the Desc As JSON `clustering_columns` field. Previously, `clustering_information` contained redundant column information, as the clustering columns' full `StructType` definition is already included in the `columns` field of Desc As JSON. ### Why are the changes needed? Clean up the code to ensure consistency, and update the docs to document the `clustering_columns` field (and the similar `partition_columns` field, which was previously undocumented). ### Does this PR introduce _any_ user-facing change? Yes, it updates the output of the SQL command `DESC AS JSON`. ### How was this patch tested? Added a test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #50125 from asl3/asl3/clusteringinfo-test. Authored-by: Amanda Liu <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 4b2ae48 commit 86c2140

File tree

3 files changed

+75
-18
lines changed

3 files changed

+75
-18
lines changed

docs/sql-ref-syntax-aux-describe-table.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,8 @@ to return the metadata pertaining to a partition or column respectively.
9797
"partition_values": {
9898
"<col_name>": "<val>"
9999
},
100+
"partition_columns": ["col1", "col2"],
101+
"clustering_columns": ["col1", "col2"],
100102
"location": "<path>",
101103
"view_text": "<view_text>",
102104
"view_original_text": "<view_original_text>",

sql/core/src/main/scala/org/apache/spark/sql/execution/command/DescribeRelationJsonCommand.scala

Lines changed: 11 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -239,23 +239,17 @@ case class DescribeRelationJsonCommand(
239239
private def describeClusteringInfoJson(
240240
table: CatalogTable, jsonMap: mutable.LinkedHashMap[String, JValue]): Unit = {
241241
table.clusterBySpec.foreach { clusterBySpec =>
242-
val clusteringColumnsJson: JValue = JArray(
243-
clusterBySpec.columnNames.map { fieldNames =>
244-
val nestedFieldOpt = table.schema.findNestedField(fieldNames.fieldNames.toIndexedSeq)
245-
assert(nestedFieldOpt.isDefined,
246-
"The clustering column " +
247-
s"${fieldNames.fieldNames.map(quoteIfNeeded).mkString(".")} " +
248-
s"was not found in the table schema ${table.schema.catalogString}."
249-
)
250-
val (path, field) = nestedFieldOpt.get
251-
JObject(
252-
"name" -> JString((path :+ field.name).map(quoteIfNeeded).mkString(".")),
253-
"type" -> jsonType(field.dataType),
254-
"comment" -> field.getComment().map(JString).getOrElse(JNull)
255-
)
256-
}.toList
257-
)
258-
addKeyValueToMap("clustering_information", clusteringColumnsJson, jsonMap)
242+
val clusteringColumnsJson = JArray(clusterBySpec.columnNames.map { fieldNames =>
243+
val nestedFieldOpt = table.schema.findNestedField(fieldNames.fieldNames.toIndexedSeq)
244+
assert(nestedFieldOpt.isDefined,
245+
"The clustering column " +
246+
s"${fieldNames.fieldNames.map(quoteIfNeeded).mkString(".")} " +
247+
s"was not found in the table schema ${table.schema.catalogString}."
248+
)
249+
JString(fieldNames.fieldNames.map(quoteIfNeeded).mkString("."))
250+
}.toList)
251+
252+
addKeyValueToMap("clustering_columns", clusteringColumnsJson, jsonMap)
259253
}
260254
}
261255

sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/DescribeTableSuite.scala

Lines changed: 62 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -289,7 +289,8 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
289289
"max_records" -> "1000"
290290
)),
291291
partition_provider = Some("Catalog"),
292-
partition_columns = Some(List("department", "hire_date"))
292+
partition_columns = Some(List("department", "hire_date")),
293+
clustering_columns = None // no cluster spec for "CLUSTERED BY"
293294
)
294295

295296
assert(parsedOutput.location.isDefined)
@@ -364,6 +365,65 @@ trait DescribeTableSuiteBase extends command.DescribeTableSuiteBase
364365
}
365366
}
366367

368+
test("DESCRIBE AS JSON cluster spec") {
369+
withNamespaceAndTable("ns", "table") { t =>
370+
val tableCreationStr =
371+
s"""
372+
|CREATE TABLE $t (
373+
| id INT,
374+
| name STRING,
375+
| region STRING,
376+
| category STRING
377+
|) USING parquet
378+
|COMMENT 'test cluster spec'
379+
|CLUSTER BY (id, name)
380+
|TBLPROPERTIES ('t' = 'test')
381+
|""".stripMargin
382+
spark.sql(tableCreationStr)
383+
384+
val descriptionDf =
385+
spark.sql(s"DESCRIBE FORMATTED $t AS JSON")
386+
val firstRow = descriptionDf.select("json_metadata").head()
387+
val jsonValue = firstRow.getString(0)
388+
val parsedOutput = parse(jsonValue).extract[DescribeTableJson]
389+
390+
val expectedOutput = DescribeTableJson(
391+
table_name = Some("table"),
392+
catalog_name = Some("spark_catalog"),
393+
namespace = Some(List("ns")),
394+
schema_name = Some("ns"),
395+
columns = Some(List(
396+
TableColumn("id", Type("int"), true),
397+
TableColumn("name", Type("string"), true),
398+
TableColumn("region", Type("string"), true),
399+
TableColumn("category", Type("string"), true)
400+
)),
401+
last_access = Some("UNKNOWN"),
402+
created_by = Some(s"Spark $SPARK_VERSION"),
403+
`type` = Some("MANAGED"),
404+
provider = Some("parquet"),
405+
bucket_columns = Some(Nil),
406+
sort_columns = Some(Nil),
407+
comment = Some("test cluster spec"),
408+
table_properties = Some(Map(
409+
"t" -> "test",
410+
"clusteringColumns" -> "[[\"id\"],[\"name\"]]"
411+
)),
412+
serde_library = if (getProvider() == "hive") {
413+
Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")
414+
} else {
415+
None
416+
},
417+
clustering_columns = Some(List("id", "name"))
418+
)
419+
420+
assert(parsedOutput.location.isDefined)
421+
assert(iso8601Regex.matches(parsedOutput.created_time.get))
422+
assert(expectedOutput == parsedOutput.copy(
423+
location = None, created_time = None, storage_properties = None))
424+
}
425+
}
426+
367427
test("DESCRIBE AS JSON default values") {
368428
withNamespaceAndTable("ns", "table") { t =>
369429
val tableCreationStr =
@@ -756,6 +816,7 @@ case class DescribeTableJson(
756816
partition_provider: Option[String] = None,
757817
partition_columns: Option[List[String]] = Some(Nil),
758818
partition_values: Option[Map[String, String]] = None,
819+
clustering_columns: Option[List[String]] = None,
759820
statistics: Option[Map[String, Any]] = None,
760821
view_text: Option[String] = None,
761822
view_original_text: Option[String] = None,

0 commit comments

Comments
 (0)