Commit c48085a

Mykhailo Shtelma authored and gatorsmile committed
[SPARK-23799][SQL] FilterEstimation.evaluateInSet produces division by zero in the case of an empty table with analyzed statistics
What changes were proposed in this pull request?

During evaluation of IN conditions, if the source DataFrame is represented by a plan that reads a Hive table whose columns were previously analyzed, and the plan contains conditions on those columns that cannot be satisfied (leaving an empty DataFrame), the FilterEstimation.evaluateInSet method throws NumberFormatException and ClassCastException. To fix this bug, FilterEstimation.evaluateInSet now first checks that the distinct count is not zero and that colStat.min and colStat.max are defined, and only then proceeds with the calculation. If at least one of these conditions is not satisfied, zero is returned.

How was this patch tested?

Two tests were added: one in FilterEstimationSuite, which exercises a plan whose statistics violate the conditions above, and one in StatisticsCollectionSuite, which covers the whole analysis/optimization of a query that leads to the problem described in the first section.

Author: Mykhailo Shtelma <[email protected]>
Author: smikesh <[email protected]>

Closes apache#21052 from mshtelma/filter_estimation_evaluateInSet_Bugs.
1 parent 7bc853d commit c48085a
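Editor's note: the guard described in the commit message can be summarized in isolation. The following is a minimal standalone sketch, not the Spark source; the name InSetSelectivitySketch, the method inSetSelectivity, and its parameters are illustrative stand-ins for what FilterEstimation.evaluateInSet works with (the column's distinct count, the distinct count restricted to the IN-list, and the column's min/max statistics), and the final formula is assumed for illustration.

object InSetSelectivitySketch {
  // Short-circuit before any arithmetic or min/max access: an analyzed-but-empty
  // column (ndv = 0, min/max undefined) cannot match anything, so selectivity is 0.
  def inSetSelectivity(
      ndv: BigInt,
      newNdv: BigInt,
      min: Option[Any],
      max: Option[Any]): Option[Double] = {
    if (ndv.toDouble == 0 || min.isEmpty || max.isEmpty) {
      Some(0.0)
    } else {
      // Assumed formula: fraction of distinct values kept by the IN-list, capped at 1.0.
      Some(math.min(newNdv.toDouble / ndv.toDouble, 1.0))
    }
  }
}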

File tree

3 files changed: +43 −0 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala

Lines changed: 4 additions & 0 deletions
@@ -392,6 +392,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
     val dataType = attr.dataType
     var newNdv = ndv
 
+    if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
+      return Some(0.0)
+    }
+
     // use [min, max] to filter the original hSet
     dataType match {
       case _: NumericType | BooleanType | DateType | TimestampType =>
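Editor's note: the guard sits before the [min, max] filtering of the IN-list and before the per-type handling in the match, so when the column statistics describe an empty table (distinct count 0, undefined min/max) the method returns Some(0.0) immediately instead of falling through to the code that previously produced the NumberFormatException and ClassCastException described in the commit message.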

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala

Lines changed: 11 additions & 0 deletions
@@ -357,6 +357,17 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
       expectedRowCount = 3)
   }
 
+  test("evaluateInSet with all zeros") {
+    validateEstimatedStats(
+      Filter(InSet(attrString, Set(3, 4, 5)),
+        StatsTestPlan(Seq(attrString), 0,
+          AttributeMap(Seq(attrString ->
+            ColumnStat(distinctCount = Some(0), min = None, max = None,
+              nullCount = Some(0), avgLen = Some(0), maxLen = Some(0)))))),
+      Seq(attrString -> ColumnStat(distinctCount = Some(0))),
+      expectedRowCount = 0)
+  }
+
   test("cint NOT IN (3, 4, 5)") {
     validateEstimatedStats(
       Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
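Editor's note: the new test feeds validateEstimatedStats a StatsTestPlan with a row count of 0 and a ColumnStat whose distinctCount is Some(0) and whose min/max are None, and expects an estimated row count of 0 rather than an exception. It should be runnable on its own with something like build/sbt "catalyst/testOnly *FilterEstimationSuite" (command given for convenience; not part of the patch).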

sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala

Lines changed: 28 additions & 0 deletions
@@ -382,4 +382,32 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
       }
     }
   }
+
+  test("Simple queries must be working, if CBO is turned on") {
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      withTable("TBL1", "TBL") {
+        import org.apache.spark.sql.functions._
+        val df = spark.range(1000L).select('id,
+          'id * 2 as "FLD1",
+          'id * 12 as "FLD2",
+          lit("aaa") + 'id as "fld3")
+        df.write
+          .mode(SaveMode.Overwrite)
+          .bucketBy(10, "id", "FLD1", "FLD2")
+          .sortBy("id", "FLD1", "FLD2")
+          .saveAsTable("TBL")
+        sql("ANALYZE TABLE TBL COMPUTE STATISTICS ")
+        sql("ANALYZE TABLE TBL COMPUTE STATISTICS FOR COLUMNS ID, FLD1, FLD2, FLD3")
+        val df2 = spark.sql(
+          """
+             |SELECT t1.id, t1.fld1, t1.fld2, t1.fld3
+             |FROM tbl t1
+             |JOIN tbl t2 on t1.id=t2.id
+             |WHERE t1.fld3 IN (-123.23,321.23)
+           """.stripMargin)
+        df2.createTempView("TBL2")
+        sql("SELECT * FROM tbl2 WHERE fld3 IN ('qqq', 'qwe') ").queryExecution.executedPlan
+      }
+    }
+  }
 }
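Editor's note: in this reproduction the fld3 column is built as lit("aaa") + 'id; the string-plus-number addition presumably evaluates to NULL for every row, so after ANALYZE ... FOR COLUMNS the column's statistics carry a distinct count of 0 and no min/max, which is exactly the state that drove evaluateInSet into the failure before the fix. The IN predicates on fld3 (numeric literals first, then strings on the temp view) then exercise the estimation path with CBO enabled. Running DESCRIBE EXTENDED tbl fld3 after the ANALYZE should confirm the empty column statistics (an editorial suggestion, not part of the patch).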
