Skip to content

Commit 1ff4a77

Browse files
wzhfycloud-fan
authored andcommitted
[SPARK-22529][SQL] Relation stats should be consistent with other plans based on cbo config
## What changes were proposed in this pull request? Currently, relation stats is the same whether cbo is enabled or not. While relation (`LogicalRelation` or `HiveTableRelation`) is a `LogicalPlan`, its behavior is inconsistent with other plans. This can cause confusion when user runs EXPLAIN COST commands. Besides, when CBO is disabled, we apply the size-only estimation strategy, so there's no need to propagate other catalog statistics to relation. ## How was this patch tested? Enhanced existing tests case and added a test case. Author: Zhenhua Wang <[email protected]> Closes #19757 from wzhfy/catalog_stats_conversion.
1 parent 2dbe275 commit 1ff4a77

File tree

4 files changed

+49
-22
lines changed

4 files changed

+49
-22
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,16 @@ case class CatalogStatistics(
366366
* Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
367367
* on column names.
368368
*/
369-
def toPlanStats(planOutput: Seq[Attribute]): Statistics = {
370-
val matched = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
371-
Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
372-
attributeStats = AttributeMap(matched))
369+
def toPlanStats(planOutput: Seq[Attribute], cboEnabled: Boolean): Statistics = {
370+
if (cboEnabled) {
371+
val attrStats = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
372+
Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
373+
attributeStats = AttributeMap(attrStats))
374+
} else {
375+
// When CBO is disabled, we apply the size-only estimation strategy, so there's no need to
376+
// propagate other statistics from catalog to the plan.
377+
Statistics(sizeInBytes = sizeInBytes)
378+
}
373379
}
374380

375381
/** Readable string representation for the CatalogStatistics. */
@@ -452,7 +458,7 @@ case class HiveTableRelation(
452458
)
453459

454460
override def computeStats(): Statistics = {
455-
tableMeta.stats.map(_.toPlanStats(output)).getOrElse {
461+
tableMeta.stats.map(_.toPlanStats(output, conf.cboEnabled)).getOrElse {
456462
throw new IllegalStateException("table stats must be specified.")
457463
}
458464
}

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ case class LogicalRelation(
4141

4242
override def computeStats(): Statistics = {
4343
catalogTable
44-
.flatMap(_.stats.map(_.toPlanStats(output)))
44+
.flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))
4545
.getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
4646
}
4747

sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, H
2828
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, LogicalPlan}
2929
import org.apache.spark.sql.catalyst.util.DateTimeUtils
3030
import org.apache.spark.sql.execution.datasources.LogicalRelation
31-
import org.apache.spark.sql.internal.StaticSQLConf
31+
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
3232
import org.apache.spark.sql.test.SQLTestUtils
3333
import org.apache.spark.sql.types.Decimal
3434

@@ -223,11 +223,19 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
223223
assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat))
224224

225225
// Check relation statistics
226-
assert(relation.stats.sizeInBytes == 0)
227-
assert(relation.stats.rowCount == Some(0))
228-
assert(relation.stats.attributeStats.size == 1)
229-
val (attribute, colStat) = relation.stats.attributeStats.head
230-
assert(attribute.name == "c1")
231-
assert(colStat == emptyColStat)
226+
withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
227+
assert(relation.stats.sizeInBytes == 0)
228+
assert(relation.stats.rowCount == Some(0))
229+
assert(relation.stats.attributeStats.size == 1)
230+
val (attribute, colStat) = relation.stats.attributeStats.head
231+
assert(attribute.name == "c1")
232+
assert(colStat == emptyColStat)
233+
}
234+
relation.invalidateStatsCache()
235+
withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
236+
assert(relation.stats.sizeInBytes == 0)
237+
assert(relation.stats.rowCount.isEmpty)
238+
assert(relation.stats.attributeStats.isEmpty)
239+
}
232240
}
233241
}

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
package org.apache.spark.sql.hive.execution
1919

2020
import org.apache.spark.sql.QueryTest
21+
import org.apache.spark.sql.catalyst.TableIdentifier
2122
import org.apache.spark.sql.catalyst.parser.ParseException
2223
import org.apache.spark.sql.hive.test.TestHiveSingleton
24+
import org.apache.spark.sql.internal.SQLConf
2325
import org.apache.spark.sql.test.SQLTestUtils
2426

2527
/**
@@ -29,21 +31,32 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
2931
import testImplicits._
3032

3133
test("show cost in explain command") {
34+
val explainCostCommand = "EXPLAIN COST SELECT * FROM src"
3235
// For readability, we only show optimized plan and physical plan in explain cost command
33-
checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "),
36+
checkKeywordsExist(sql(explainCostCommand),
3437
"Optimized Logical Plan", "Physical Plan")
35-
checkKeywordsNotExist(sql("EXPLAIN COST SELECT * FROM src "),
38+
checkKeywordsNotExist(sql(explainCostCommand),
3639
"Parsed Logical Plan", "Analyzed Logical Plan")
3740

38-
// Only has sizeInBytes before ANALYZE command
39-
checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "), "sizeInBytes")
40-
checkKeywordsNotExist(sql("EXPLAIN COST SELECT * FROM src "), "rowCount")
41+
withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
42+
// Only has sizeInBytes before ANALYZE command
43+
checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
44+
checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
4145

42-
// Has both sizeInBytes and rowCount after ANALYZE command
43-
sql("ANALYZE TABLE src COMPUTE STATISTICS")
44-
checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "), "sizeInBytes", "rowCount")
46+
// Has both sizeInBytes and rowCount after ANALYZE command
47+
sql("ANALYZE TABLE src COMPUTE STATISTICS")
48+
checkKeywordsExist(sql(explainCostCommand), "sizeInBytes", "rowCount")
49+
}
50+
51+
spark.sessionState.catalog.refreshTable(TableIdentifier("src"))
52+
53+
withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
54+
// Don't show rowCount if cbo is disabled
55+
checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
56+
checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
57+
}
4558

46-
// No cost information
59+
// No statistics information if "cost" is not specified
4760
checkKeywordsNotExist(sql("EXPLAIN SELECT * FROM src "), "sizeInBytes", "rowCount")
4861
}
4962

0 commit comments

Comments
 (0)