Skip to content
This repository was archived by the owner on Jan 9, 2020. It is now read-only.

Commit 42cc830

Browse files
lianhuiwangcloud-fan
authored andcommitted
[SPARK-20986][SQL] Reset table's statistics after PruneFileSourcePartitions rule.
## What changes were proposed in this pull request? After PruneFileSourcePartitions rule, It needs reset table's statistics because PruneFileSourcePartitions can filter some unnecessary partitions. So the statistics need to be changed. ## How was this patch tested? add unit test. Author: lianhuiwang <[email protected]> Closes apache#18205 from lianhuiwang/SPARK-20986. (cherry picked from commit 8b5b2e2) Signed-off-by: Wenchen Fan <[email protected]>
1 parent 53212c3 commit 42cc830

File tree

2 files changed

+31
-2
lines changed

2 files changed

+31
-2
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.sql.execution.datasources
1919

20+
import org.apache.spark.sql.catalyst.catalog.CatalogStatistics
2021
import org.apache.spark.sql.catalyst.expressions._
2122
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
2223
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
@@ -59,8 +60,11 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
5960
val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
6061
val prunedFsRelation =
6162
fsRelation.copy(location = prunedFileIndex)(sparkSession)
62-
val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
63-
63+
// Change table stats based on the sizeInBytes of pruned files
64+
val withStats = logicalRelation.catalogTable.map(_.copy(
65+
stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes)))))
66+
val prunedLogicalRelation = logicalRelation.copy(
67+
relation = prunedFsRelation, catalogTable = withStats)
6468
// Keep partition-pruning predicates so that they are visible in physical planning
6569
val filterExpression = filters.reduceLeft(And)
6670
val filter = Filter(filterExpression, prunedLogicalRelation)

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.spark.sql.hive.execution
1919

2020
import org.apache.spark.sql.QueryTest
21+
import org.apache.spark.sql.catalyst.TableIdentifier
2122
import org.apache.spark.sql.catalyst.dsl.expressions._
2223
import org.apache.spark.sql.catalyst.dsl.plans._
2324
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
@@ -66,4 +67,28 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te
6667
}
6768
}
6869
}
70+
71+
test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
72+
withTable("tbl") {
73+
spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
74+
sql(s"ANALYZE TABLE tbl COMPUTE STATISTICS")
75+
val tableStats = spark.sessionState.catalog.getTableMetadata(TableIdentifier("tbl")).stats
76+
assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "tableStats is lost")
77+
78+
val df = sql("SELECT * FROM tbl WHERE p = 1")
79+
val sizes1 = df.queryExecution.analyzed.collect {
80+
case relation: LogicalRelation => relation.catalogTable.get.stats.get.sizeInBytes
81+
}
82+
assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
83+
assert(sizes1(0) == tableStats.get.sizeInBytes)
84+
85+
val relations = df.queryExecution.optimizedPlan.collect {
86+
case relation: LogicalRelation => relation
87+
}
88+
assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}")
89+
val size2 = relations(0).computeStats(conf).sizeInBytes
90+
assert(size2 == relations(0).catalogTable.get.stats.get.sizeInBytes)
91+
assert(size2 < tableStats.get.sizeInBytes)
92+
}
93+
}
6994
}

0 commit comments

Comments
 (0)