Skip to content

Commit b96f61b

Browse files
mgaido91 authored and cloud-fan committed
[SPARK-22475][SQL] show histogram in DESC COLUMN command
## What changes were proposed in this pull request?

Added the histogram representation to the output of the `DESCRIBE EXTENDED table_name column_name` command.

## How was this patch tested?

Modified the SQL unit tests and checked the output.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Marco Gaido <[email protected]>

Closes #19774 from mgaido91/SPARK-22475.
1 parent 6d7ebf2 commit b96f61b

File tree

3 files changed

+93
-8
lines changed

3 files changed

+93
-8
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.catalog._
3434
import org.apache.spark.sql.catalyst.catalog.CatalogTableType._
3535
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
3636
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
37+
import org.apache.spark.sql.catalyst.plans.logical.Histogram
3738
import org.apache.spark.sql.catalyst.util.quoteIdentifier
3839
import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils}
3940
import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
@@ -689,9 +690,25 @@ case class DescribeColumnCommand(
689690
buffer += Row("distinct_count", cs.map(_.distinctCount.toString).getOrElse("NULL"))
690691
buffer += Row("avg_col_len", cs.map(_.avgLen.toString).getOrElse("NULL"))
691692
buffer += Row("max_col_len", cs.map(_.maxLen.toString).getOrElse("NULL"))
693+
val histDesc = for {
694+
c <- cs
695+
hist <- c.histogram
696+
} yield histogramDescription(hist)
697+
buffer ++= histDesc.getOrElse(Seq(Row("histogram", "NULL")))
692698
}
693699
buffer
694700
}
701+
702+
/**
 * Renders a column histogram as the rows appended to the DESC COLUMN output.
 *
 * The first row is labelled "histogram" and reports the histogram's height and its
 * number of bins. It is followed by one row per bin, labelled `bin_<index>` in bin
 * order, reporting that bin's lower bound, upper bound and distinct-value count.
 *
 * @param histogram the histogram to describe
 * @return the summary row followed by one row per bin
 */
private def histogramDescription(histogram: Histogram): Seq[Row] = {
  val summary = s"height: ${histogram.height}, num_of_bins: ${histogram.bins.length}"
  val binRows = histogram.bins.zipWithIndex.map { case (b, i) =>
    val detail = s"lower_bound: ${b.lo}, upper_bound: ${b.hi}, distinct_count: ${b.ndv}"
    Row(s"bin_$i", detail)
  }
  // The summary row always comes first, even when there are no bins.
  Row("histogram", summary) +: binRows
}
695712
}
696713

697714
/**

sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ DESC FORMATTED desc_complex_col_table col;
3434
-- Describe a nested column
3535
DESC FORMATTED desc_complex_col_table col.x;
3636

37+
-- Test output for histogram statistics
38+
SET spark.sql.statistics.histogram.enabled=true;
39+
SET spark.sql.statistics.histogram.numBins=2;
40+
41+
INSERT INTO desc_col_table values 1, 2, 3, 4;
42+
43+
ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key;
44+
45+
DESC EXTENDED desc_col_table key;
46+
3747
DROP VIEW desc_col_temp_view;
3848

3949
DROP TABLE desc_col_table;

sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-- Automatically generated by SQLQueryTestSuite
2-
-- Number of queries: 18
2+
-- Number of queries: 23
33

44

55
-- !query 0
@@ -34,6 +34,7 @@ num_nulls NULL
3434
distinct_count NULL
3535
avg_col_len NULL
3636
max_col_len NULL
37+
histogram NULL
3738

3839

3940
-- !query 3
@@ -50,6 +51,7 @@ num_nulls NULL
5051
distinct_count NULL
5152
avg_col_len NULL
5253
max_col_len NULL
54+
histogram NULL
5355

5456

5557
-- !query 4
@@ -66,6 +68,7 @@ num_nulls NULL
6668
distinct_count NULL
6769
avg_col_len NULL
6870
max_col_len NULL
71+
histogram NULL
6972

7073

7174
-- !query 5
@@ -117,6 +120,7 @@ num_nulls 0
117120
distinct_count 0
118121
avg_col_len 4
119122
max_col_len 4
123+
histogram NULL
120124

121125

122126
-- !query 10
@@ -133,6 +137,7 @@ num_nulls 0
133137
distinct_count 0
134138
avg_col_len 4
135139
max_col_len 4
140+
histogram NULL
136141

137142

138143
-- !query 11
@@ -157,6 +162,7 @@ num_nulls NULL
157162
distinct_count NULL
158163
avg_col_len NULL
159164
max_col_len NULL
165+
histogram NULL
160166

161167

162168
-- !query 13
@@ -173,6 +179,7 @@ num_nulls NULL
173179
distinct_count NULL
174180
avg_col_len NULL
175181
max_col_len NULL
182+
histogram NULL
176183

177184

178185
-- !query 14
@@ -185,24 +192,75 @@ DESC TABLE COLUMN command does not support nested data types: col.x;
185192

186193

187194
-- !query 15
188-
DROP VIEW desc_col_temp_view
195+
SET spark.sql.statistics.histogram.enabled=true
189196
-- !query 15 schema
190-
struct<>
197+
struct<key:string,value:string>
191198
-- !query 15 output
192-
199+
spark.sql.statistics.histogram.enabled true
193200

194201

195202
-- !query 16
196-
DROP TABLE desc_col_table
203+
SET spark.sql.statistics.histogram.numBins=2
197204
-- !query 16 schema
198-
struct<>
205+
struct<key:string,value:string>
199206
-- !query 16 output
200-
207+
spark.sql.statistics.histogram.numBins 2
201208

202209

203210
-- !query 17
204-
DROP TABLE desc_complex_col_table
211+
INSERT INTO desc_col_table values 1, 2, 3, 4
205212
-- !query 17 schema
206213
struct<>
207214
-- !query 17 output
208215

216+
217+
218+
-- !query 18
219+
ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key
220+
-- !query 18 schema
221+
struct<>
222+
-- !query 18 output
223+
224+
225+
226+
-- !query 19
227+
DESC EXTENDED desc_col_table key
228+
-- !query 19 schema
229+
struct<info_name:string,info_value:string>
230+
-- !query 19 output
231+
col_name key
232+
data_type int
233+
comment column_comment
234+
min 1
235+
max 4
236+
num_nulls 0
237+
distinct_count 4
238+
avg_col_len 4
239+
max_col_len 4
240+
histogram height: 2.0, num_of_bins: 2
241+
bin_0 lower_bound: 1.0, upper_bound: 2.0, distinct_count: 2
242+
bin_1 lower_bound: 2.0, upper_bound: 4.0, distinct_count: 2
243+
244+
245+
-- !query 20
246+
DROP VIEW desc_col_temp_view
247+
-- !query 20 schema
248+
struct<>
249+
-- !query 20 output
250+
251+
252+
253+
-- !query 21
254+
DROP TABLE desc_col_table
255+
-- !query 21 schema
256+
struct<>
257+
-- !query 21 output
258+
259+
260+
261+
-- !query 22
262+
DROP TABLE desc_complex_col_table
263+
-- !query 22 schema
264+
struct<>
265+
-- !query 22 output
266+

0 commit comments

Comments
 (0)