[SPARK-22529][SQL] Relation stats should be consistent with other plans based on cbo config

wzhfy · cloud-fan · commit 1ff4a77be498 · 2017-11-28T01:13:44.000+08:00
## What changes were proposed in this pull request? Currently, relation stats is the same whether cbo is enabled or not. While relation (`LogicalRelation` or `HiveTableRelation`) is a `LogicalPlan`, its behavior is inconsistent with other plans. This can cause confusion when user runs EXPLAIN COST commands. Besides, when CBO is disabled, we apply the size-only estimation strategy, so there's no need to propagate other catalog statistics to relation. ## How was this patch tested? Enhanced existing tests case and added a test case. Author: Zhenhua Wang <wangzhenhua@huawei.com> Closes #19757 from wzhfy/catalog_stats_conversion.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -366,10 +366,16 @@ case class CatalogStatistics(
    * Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
    * on column names.
    */
-  def toPlanStats(planOutput: Seq[Attribute]): Statistics = {
-    val matched = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
-    Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
-      attributeStats = AttributeMap(matched))
+  def toPlanStats(planOutput: Seq[Attribute], cboEnabled: Boolean): Statistics = {
+    if (cboEnabled) {
+      val attrStats = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
+      Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
+        attributeStats = AttributeMap(attrStats))
+    } else {
+      // When CBO is disabled, we apply the size-only estimation strategy, so there's no need to
+      // propagate other statistics from catalog to the plan.
+      Statistics(sizeInBytes = sizeInBytes)
+    }
   }
 
   /** Readable string representation for the CatalogStatistics. */
@@ -452,7 +458,7 @@ case class HiveTableRelation(
   )
 
   override def computeStats(): Statistics = {
-    tableMeta.stats.map(_.toPlanStats(output)).getOrElse {
+    tableMeta.stats.map(_.toPlanStats(output, conf.cboEnabled)).getOrElse {
       throw new IllegalStateException("table stats must be specified.")
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -41,7 +41,7 @@ case class LogicalRelation(
 
   override def computeStats(): Statistics = {
     catalogTable
-      .flatMap(_.stats.map(_.toPlanStats(output)))
+      .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))
       .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, H
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, LogicalPlan}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.internal.StaticSQLConf
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.Decimal
 
@@ -223,11 +223,19 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
     assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat))
 
     // Check relation statistics
-    assert(relation.stats.sizeInBytes == 0)
-    assert(relation.stats.rowCount == Some(0))
-    assert(relation.stats.attributeStats.size == 1)
-    val (attribute, colStat) = relation.stats.attributeStats.head
-    assert(attribute.name == "c1")
-    assert(colStat == emptyColStat)
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      assert(relation.stats.sizeInBytes == 0)
+      assert(relation.stats.rowCount == Some(0))
+      assert(relation.stats.attributeStats.size == 1)
+      val (attribute, colStat) = relation.stats.attributeStats.head
+      assert(attribute.name == "c1")
+      assert(colStat == emptyColStat)
+    }
+    relation.invalidateStatsCache()
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+      assert(relation.stats.sizeInBytes == 0)
+      assert(relation.stats.rowCount.isEmpty)
+      assert(relation.stats.attributeStats.isEmpty)
+    }
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
@@ -18,8 +18,10 @@
 package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 
 /**
@@ -29,21 +31,32 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
   import testImplicits._
 
   test("show cost in explain command") {
+    val explainCostCommand = "EXPLAIN COST  SELECT * FROM src"
     // For readability, we only show optimized plan and physical plan in explain cost command
-    checkKeywordsExist(sql("EXPLAIN COST  SELECT * FROM src "),
+    checkKeywordsExist(sql(explainCostCommand),
       "Optimized Logical Plan", "Physical Plan")
-    checkKeywordsNotExist(sql("EXPLAIN COST  SELECT * FROM src "),
+    checkKeywordsNotExist(sql(explainCostCommand),
       "Parsed Logical Plan", "Analyzed Logical Plan")
 
-    // Only has sizeInBytes before ANALYZE command
-    checkKeywordsExist(sql("EXPLAIN COST  SELECT * FROM src "), "sizeInBytes")
-    checkKeywordsNotExist(sql("EXPLAIN COST  SELECT * FROM src "), "rowCount")
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      // Only has sizeInBytes before ANALYZE command
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+      checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
 
-    // Has both sizeInBytes and rowCount after ANALYZE command
-    sql("ANALYZE TABLE src COMPUTE STATISTICS")
-    checkKeywordsExist(sql("EXPLAIN COST  SELECT * FROM src "), "sizeInBytes", "rowCount")
+      // Has both sizeInBytes and rowCount after ANALYZE command
+      sql("ANALYZE TABLE src COMPUTE STATISTICS")
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes", "rowCount")
+    }
+
+    spark.sessionState.catalog.refreshTable(TableIdentifier("src"))
+
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+      // Don't show rowCount if cbo is disabled
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+      checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
+    }
 
-    // No cost information
+    // No statistics information if "cost" is not specified
     checkKeywordsNotExist(sql("EXPLAIN  SELECT * FROM src "), "sizeInBytes", "rowCount")
   }
 

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ case class LogicalRelation(`
`41`	`41`
`42`	`42`	`override def computeStats(): Statistics = {`
`43`	`43`	`catalogTable`
`44`		`- .flatMap(_.stats.map(_.toPlanStats(output)))`
	`44`	`+ .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))`
`45`	`45`	`.getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))`
`46`	`46`	`}`
`47`	`47`