
Commit 515910e

wzhfy authored and gatorsmile committed
[SPARK-17642][SQL] support DESC EXTENDED/FORMATTED table column commands
## What changes were proposed in this pull request?

Support the `DESC [EXTENDED|FORMATTED] table_name column_name` command. When EXTENDED or FORMATTED is specified, it also shows column-level statistics. Describing nested columns is NOT supported.

## How was this patch tested?

Added test cases.

Author: Zhenhua Wang <[email protected]>
Author: Zhenhua Wang <[email protected]>
Author: wangzhenhua <[email protected]>

Closes apache#16422 from wzhfy/descColumn.
1 parent 9575582 commit 515910e
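
For context, a minimal spark-shell session exercising the new command might look like the following sketch (table and column names `t` and `c` are hypothetical; `spark` is the pre-created SparkSession):

```scala
// Hypothetical names; assumes the pre-created `spark` SparkSession in spark-shell.
spark.sql("CREATE TABLE t (c INT COMMENT 'a comment') USING PARQUET")
spark.sql("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS c")

// Basic form: shows col_name, data_type and comment.
spark.sql("DESC t c").show()

// EXTENDED (or FORMATTED) additionally shows the column statistics:
// min, max, num_nulls, distinct_count, avg_col_len, max_col_len.
spark.sql("DESC EXTENDED t c").show()
```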

File tree: 7 files changed, +332 −13 lines

sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4

Lines changed: 1 addition & 1 deletion

@@ -270,7 +270,7 @@ describeFuncName
     ;
 
 describeColName
-    : identifier ('.' (identifier | STRING))*
+    : nameParts+=identifier ('.' nameParts+=identifier)*
     ;
 
 ctes

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala

Lines changed: 10 additions & 4 deletions

@@ -330,10 +330,16 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) {
    * Create a [[DescribeTableCommand]] logical plan.
    */
   override def visitDescribeTable(ctx: DescribeTableContext): LogicalPlan = withOrigin(ctx) {
-    // Describe column are not supported yet. Return null and let the parser decide
-    // what to do with this (create an exception or pass it on to a different system).
+    val isExtended = ctx.EXTENDED != null || ctx.FORMATTED != null
     if (ctx.describeColName != null) {
-      null
+      if (ctx.partitionSpec != null) {
+        throw new ParseException("DESC TABLE COLUMN for a specific partition is not supported", ctx)
+      } else {
+        DescribeColumnCommand(
+          visitTableIdentifier(ctx.tableIdentifier),
+          ctx.describeColName.nameParts.asScala.map(_.getText),
+          isExtended)
+      }
     } else {
       val partitionSpec = if (ctx.partitionSpec != null) {
         // According to the syntax, visitPartitionSpec returns `Map[String, Option[String]]`.
@@ -348,7 +354,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) {
       DescribeTableCommand(
         visitTableIdentifier(ctx.tableIdentifier),
         partitionSpec,
-        ctx.EXTENDED != null || ctx.FORMATTED != null)
+        isExtended)
     }
   }
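
To illustrate what the parser now produces, a small sketch (hypothetical table/column names `t` and `c`; assumes an existing SparkSession `spark`, e.g. in spark-shell):

```scala
// Sketch: column describes now parse into a DescribeColumnCommand.
// `t` and `c` are hypothetical names; `spark` is an existing SparkSession.
val plan = spark.sessionState.sqlParser.parsePlan("DESC EXTENDED t c")
// Expected shape: a DescribeColumnCommand for table `t`, column parts Seq("c"),
// with isExtended = true.
println(plan)

// The labeled `nameParts` in the grammar split a qualified name on dots,
// so a table-qualified column like `t.c` arrives as Seq("t", "c").
println(spark.sessionState.sqlParser.parsePlan("DESC t t.c"))
```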

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 70 additions & 2 deletions

@@ -29,13 +29,13 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException
+import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
-import org.apache.spark.sql.execution.datasources.{DataSource, FileFormat, PartitioningUtils}
+import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils}
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
@@ -631,6 +631,74 @@ case class DescribeTableCommand(
   }
 }
 
+/**
+ * A command to list the info for a column, including name, data type, column stats and comment.
+ * This function creates a [[DescribeColumnCommand]] logical plan.
+ *
+ * The syntax of using this command in SQL is:
+ * {{{
+ *   DESCRIBE [EXTENDED|FORMATTED] table_name column_name;
+ * }}}
+ */
+case class DescribeColumnCommand(
+    table: TableIdentifier,
+    colNameParts: Seq[String],
+    isExtended: Boolean)
+  extends RunnableCommand {
+
+  override val output: Seq[Attribute] = {
+    Seq(
+      AttributeReference("info_name", StringType, nullable = false,
+        new MetadataBuilder().putString("comment", "name of the column info").build())(),
+      AttributeReference("info_value", StringType, nullable = false,
+        new MetadataBuilder().putString("comment", "value of the column info").build())()
+    )
+  }
+
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    val catalog = sparkSession.sessionState.catalog
+    val resolver = sparkSession.sessionState.conf.resolver
+    val relation = sparkSession.table(table).queryExecution.analyzed
+
+    val colName = UnresolvedAttribute(colNameParts).name
+    val field = {
+      relation.resolve(colNameParts, resolver).getOrElse {
+        throw new AnalysisException(s"Column $colName does not exist")
+      }
+    }
+    if (!field.isInstanceOf[Attribute]) {
+      // If the field is not an attribute after `resolve`, then it's a nested field.
+      throw new AnalysisException(
+        s"DESC TABLE COLUMN command does not support nested data types: $colName")
+    }
+
+    val catalogTable = catalog.getTempViewOrPermanentTableMetadata(table)
+    val colStats = catalogTable.stats.map(_.colStats).getOrElse(Map.empty)
+    val cs = colStats.get(field.name)
+
+    val comment = if (field.metadata.contains("comment")) {
+      Option(field.metadata.getString("comment"))
+    } else {
+      None
+    }
+
+    val buffer = ArrayBuffer[Row](
+      Row("col_name", field.name),
+      Row("data_type", field.dataType.catalogString),
+      Row("comment", comment.getOrElse("NULL"))
+    )
+    if (isExtended) {
+      // Show column stats when EXTENDED or FORMATTED is specified.
+      buffer += Row("min", cs.flatMap(_.min.map(_.toString)).getOrElse("NULL"))
+      buffer += Row("max", cs.flatMap(_.max.map(_.toString)).getOrElse("NULL"))
+      buffer += Row("num_nulls", cs.map(_.nullCount.toString).getOrElse("NULL"))
+      buffer += Row("distinct_count", cs.map(_.distinctCount.toString).getOrElse("NULL"))
+      buffer += Row("avg_col_len", cs.map(_.avgLen.toString).getOrElse("NULL"))
+      buffer += Row("max_col_len", cs.map(_.maxLen.toString).getOrElse("NULL"))
+    }
+    buffer
+  }
+}
 
 /**
  * A command for users to get tables in the given database.
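
One detail worth noting: the nested-column check above works because `LogicalPlan.resolve` returns a plain `Attribute` for a top-level column but an `Alias` over an extraction expression for a nested field. A minimal sketch of that behavior (hypothetical example; assumes an existing SparkSession `spark`):

```scala
import org.apache.spark.sql.catalyst.expressions.Attribute

// Hypothetical one-row example; `spark` is an existing SparkSession.
val analyzed = spark.range(1)
  .selectExpr("named_struct('x', 1) AS col", "id")
  .queryExecution.analyzed
val resolver = spark.sessionState.conf.resolver

// Top-level column: resolves to an Attribute, so DESC TABLE COLUMN accepts it.
println(analyzed.resolve(Seq("id"), resolver).map(_.isInstanceOf[Attribute]))
// Nested field: resolves to an Alias over a struct-field extraction,
// not an Attribute, so the command rejects it with an AnalysisException.
println(analyzed.resolve(Seq("col", "x"), resolver).map(_.isInstanceOf[Attribute]))
```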
Lines changed: 35 additions & 0 deletions (new SQL test input file)

@@ -0,0 +1,35 @@
+-- Test temp table
+CREATE TEMPORARY VIEW desc_col_temp_table (key int COMMENT 'column_comment') USING PARQUET;
+
+DESC desc_col_temp_table key;
+
+DESC EXTENDED desc_col_temp_table key;
+
+DESC FORMATTED desc_col_temp_table key;
+
+-- Describe a column with qualified name
+DESC FORMATTED desc_col_temp_table desc_col_temp_table.key;
+
+-- Describe a non-existent column
+DESC desc_col_temp_table key1;
+
+-- Test persistent table
+CREATE TABLE desc_col_table (key int COMMENT 'column_comment') USING PARQUET;
+
+ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key;
+
+DESC desc_col_table key;
+
+DESC EXTENDED desc_col_table key;
+
+DESC FORMATTED desc_col_table key;
+
+-- Test complex columns
+CREATE TABLE desc_col_complex_table (`a.b` int, col struct<x:int, y:string>) USING PARQUET;
+
+DESC FORMATTED desc_col_complex_table `a.b`;
+
+DESC FORMATTED desc_col_complex_table col;
+
+-- Describe a nested column
+DESC FORMATTED desc_col_complex_table col.x;
Lines changed: 184 additions & 0 deletions (new golden result file)

@@ -0,0 +1,184 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 15
+
+
+-- !query 0
+CREATE TEMPORARY VIEW desc_col_temp_table (key int COMMENT 'column_comment') USING PARQUET
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+DESC desc_col_temp_table key
+-- !query 1 schema
+struct<info_name:string,info_value:string>
+-- !query 1 output
+col_name key
+data_type int
+comment column_comment
+
+
+-- !query 2
+DESC EXTENDED desc_col_temp_table key
+-- !query 2 schema
+struct<info_name:string,info_value:string>
+-- !query 2 output
+col_name key
+data_type int
+comment column_comment
+min NULL
+max NULL
+num_nulls NULL
+distinct_count NULL
+avg_col_len NULL
+max_col_len NULL
+
+
+-- !query 3
+DESC FORMATTED desc_col_temp_table key
+-- !query 3 schema
+struct<info_name:string,info_value:string>
+-- !query 3 output
+col_name key
+data_type int
+comment column_comment
+min NULL
+max NULL
+num_nulls NULL
+distinct_count NULL
+avg_col_len NULL
+max_col_len NULL
+
+
+-- !query 4
+DESC FORMATTED desc_col_temp_table desc_col_temp_table.key
+-- !query 4 schema
+struct<info_name:string,info_value:string>
+-- !query 4 output
+col_name key
+data_type int
+comment column_comment
+min NULL
+max NULL
+num_nulls NULL
+distinct_count NULL
+avg_col_len NULL
+max_col_len NULL
+
+
+-- !query 5
+DESC desc_col_temp_table key1
+-- !query 5 schema
+struct<>
+-- !query 5 output
+org.apache.spark.sql.AnalysisException
+Column key1 does not exist;
+
+
+-- !query 6
+CREATE TABLE desc_col_table (key int COMMENT 'column_comment') USING PARQUET
+-- !query 6 schema
+struct<>
+-- !query 6 output
+
+
+
+-- !query 7
+ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key
+-- !query 7 schema
+struct<>
+-- !query 7 output
+
+
+
+-- !query 8
+DESC desc_col_table key
+-- !query 8 schema
+struct<info_name:string,info_value:string>
+-- !query 8 output
+col_name key
+data_type int
+comment column_comment
+
+
+-- !query 9
+DESC EXTENDED desc_col_table key
+-- !query 9 schema
+struct<info_name:string,info_value:string>
+-- !query 9 output
+col_name key
+data_type int
+comment column_comment
+min NULL
+max NULL
+num_nulls 0
+distinct_count 0
+avg_col_len 4
+max_col_len 4
+
+
+-- !query 10
+DESC FORMATTED desc_col_table key
+-- !query 10 schema
+struct<info_name:string,info_value:string>
+-- !query 10 output
+col_name key
+data_type int
+comment column_comment
+min NULL
+max NULL
+num_nulls 0
+distinct_count 0
+avg_col_len 4
+max_col_len 4
+
+
+-- !query 11
+CREATE TABLE desc_col_complex_table (`a.b` int, col struct<x:int, y:string>) USING PARQUET
+-- !query 11 schema
+struct<>
+-- !query 11 output
+
+
+
+-- !query 12
+DESC FORMATTED desc_col_complex_table `a.b`
+-- !query 12 schema
+struct<info_name:string,info_value:string>
+-- !query 12 output
+col_name a.b
+data_type int
+comment NULL
+min NULL
+max NULL
+num_nulls NULL
+distinct_count NULL
+avg_col_len NULL
+max_col_len NULL
+
+
+-- !query 13
+DESC FORMATTED desc_col_complex_table col
+-- !query 13 schema
+struct<info_name:string,info_value:string>
+-- !query 13 output
+col_name col
+data_type struct<x:int,y:string>
+comment NULL
+min NULL
+max NULL
+num_nulls NULL
+distinct_count NULL
+avg_col_len NULL
+max_col_len NULL
+
+
+-- !query 14
+DESC FORMATTED desc_col_complex_table col.x
+-- !query 14 schema
+struct<>
+-- !query 14 output
+org.apache.spark.sql.AnalysisException
+DESC TABLE COLUMN command does not support nested data types: col.x;

sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala

Lines changed: 5 additions & 5 deletions

@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile}
-import org.apache.spark.sql.execution.command.DescribeTableCommand
+import org.apache.spark.sql.execution.command.{DescribeColumnCommand, DescribeTableCommand}
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types.StructType
 
@@ -214,11 +214,11 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
   /** Executes a query and returns the result as (schema of the output, normalized output). */
   private def getNormalizedResult(session: SparkSession, sql: String): (StructType, Seq[String]) = {
     // Returns true if the plan is supposed to be sorted.
-    def needSort(plan: LogicalPlan): Boolean = plan match {
+    def isSorted(plan: LogicalPlan): Boolean = plan match {
       case _: Join | _: Aggregate | _: Generate | _: Sample | _: Distinct => false
-      case _: DescribeTableCommand => true
+      case _: DescribeTableCommand | _: DescribeColumnCommand => true
       case PhysicalOperation(_, _, Sort(_, true, _)) => true
-      case _ => plan.children.iterator.exists(needSort)
+      case _ => plan.children.iterator.exists(isSorted)
     }
 
     try {
@@ -233,7 +233,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
           .replaceAll("Last Access.*", s"Last Access $notIncludedMsg"))
 
       // If the output is not pre-sorted, sort it.
-      if (needSort(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted)
+      if (isSorted(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted)
 
     } catch {
       case a: AnalysisException =>