feat: do not fallback to Spark for COUNT(distinct) #2429
Changes from all commits
```diff
@@ -19,15 +19,73 @@
 package org.apache.comet

+import org.apache.comet.DataTypeSupport.isComplexType
+
 class CometFuzzAggregateSuite extends CometFuzzTestBase {

-  test("count distinct") {
+  test("count distinct - simple columns") {
     val df = spark.read.parquet(filename)
     df.createOrReplaceTempView("t1")
-    for (col <- df.columns) {
+    for (col <- df.schema.fields.filterNot(f => isComplexType(f.dataType)).map(_.name)) {
       val sql = s"SELECT count(distinct $col) FROM t1"
```
andygrove (Member) commented on the line above: could you also add tests for count distinct with multiple columns, e.g.

Author (Contributor) replied: Thanks @andygrove, I'll add them separately as well.
```diff
       val (_, cometPlan) = checkSparkAnswer(sql)
       if (usingDataSourceExec) {
         assert(1 == collectNativeScans(cometPlan).length)
       }
+
+      checkSparkAnswerAndOperator(sql)
     }
   }
+
+  // Aggregate by complex columns not yet supported
+  // https://github.com/apache/datafusion-comet/issues/2382
+  test("count distinct - complex columns") {
+    val df = spark.read.parquet(filename)
+    df.createOrReplaceTempView("t1")
+    for (col <- df.schema.fields.filter(f => isComplexType(f.dataType)).map(_.name)) {
+      val sql = s"SELECT count(distinct $col) FROM t1"
+      // Comet does not support count distinct yet
+      // https://github.com/apache/datafusion-comet/issues/2292
+      val (_, cometPlan) = checkSparkAnswer(sql)
+      if (usingDataSourceExec) {
+        assert(1 == collectNativeScans(cometPlan).length)
+      }
+    }
+  }
+
+  test("count distinct group by multiple column - simple columns") {
+    val df = spark.read.parquet(filename)
+    df.createOrReplaceTempView("t1")
+    for (col <- df.schema.fields.filterNot(f => isComplexType(f.dataType)).map(_.name)) {
+      val sql = s"SELECT c1, c2, c3, count(distinct $col) FROM t1 group by c1, c2, c3"
+      val (_, cometPlan) = checkSparkAnswer(sql)
+      if (usingDataSourceExec) {
+        assert(1 == collectNativeScans(cometPlan).length)
+      }
+
+      checkSparkAnswerAndOperator(sql)
+    }
+  }
+
+  // Aggregate by complex columns not yet supported
+  // https://github.com/apache/datafusion-comet/issues/2382
+  test("count distinct group by multiple column - complex columns") {
+    val df = spark.read.parquet(filename)
+    df.createOrReplaceTempView("t1")
+    for (col <- df.schema.fields.filter(f => isComplexType(f.dataType)).map(_.name)) {
+      val sql = s"SELECT c1, c2, c3, count(distinct $col) FROM t1 group by c1, c2, c3"
+      val (_, cometPlan) = checkSparkAnswer(sql)
+      if (usingDataSourceExec) {
+        assert(1 == collectNativeScans(cometPlan).length)
+      }
+    }
+  }
+
+  // COUNT(distinct x, y, z, ...) not yet supported
+  // https://github.com/apache/datafusion-comet/issues/2292
+  test("count distinct multiple values and group by multiple column") {
+    val df = spark.read.parquet(filename)
+    df.createOrReplaceTempView("t1")
+    for (col <- df.columns) {
+      val sql = s"SELECT c1, c2, c3, count(distinct $col, c4, c5) FROM t1 group by c1, c2, c3"
+      val (_, cometPlan) = checkSparkAnswer(sql)
+      if (usingDataSourceExec) {
+        assert(1 == collectNativeScans(cometPlan).length)

@@ -39,7 +97,6 @@ class CometFuzzAggregateSuite extends CometFuzzTestBase {
     val df = spark.read.parquet(filename)
     df.createOrReplaceTempView("t1")
     for (col <- df.columns) {
-      // cannot run fully natively due to range partitioning and sort
       val sql = s"SELECT $col, count(*) FROM t1 GROUP BY $col ORDER BY $col"
       val (_, cometPlan) = checkSparkAnswer(sql)
       if (usingDataSourceExec) {

@@ -53,7 +110,6 @@ class CometFuzzAggregateSuite extends CometFuzzTestBase {
     df.createOrReplaceTempView("t1")
     val groupCol = df.columns.head
     for (col <- df.columns.drop(1)) {
-      // cannot run fully natively due to range partitioning and sort
       val sql = s"SELECT $groupCol, count($col) FROM t1 GROUP BY $groupCol ORDER BY $groupCol"
       val (_, cometPlan) = checkSparkAnswer(sql)
       if (usingDataSourceExec) {

@@ -67,7 +123,6 @@ class CometFuzzAggregateSuite extends CometFuzzTestBase {
     df.createOrReplaceTempView("t1")
     val groupCol = df.columns.head
     val otherCol = df.columns.drop(1)
-    // cannot run fully natively due to range partitioning and sort
     val sql = s"SELECT $groupCol, count(${otherCol.mkString(", ")}) FROM t1 " +
       s"GROUP BY $groupCol ORDER BY $groupCol"
     val (_, cometPlan) = checkSparkAnswer(sql)

@@ -88,5 +143,4 @@ class CometFuzzAggregateSuite extends CometFuzzTestBase {
       }
     }
   }
-
 }
```
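For readers unfamiliar with the Comet test harness: the key change above is swapping `checkSparkAnswer` (which only compares results against vanilla Spark and tolerates fallback operators) for `checkSparkAnswerAndOperator` (which additionally requires the plan to run natively). A rough sketch of that distinction, with simplified signatures that are not Comet's actual implementation:

```scala
// Simplified sketch of the two test-harness helpers (not Comet's real code;
// the plan check and result comparison are illustrative only).
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.SparkPlan

class HarnessSketch(spark: SparkSession) {

  // Compares Comet's result against vanilla Spark's; fallback to Spark
  // operators is tolerated, and both physical plans are returned.
  // A real comparison would sort rows first, since ordering is not guaranteed.
  def checkSparkAnswer(sql: String): (SparkPlan, SparkPlan) = {
    spark.conf.set("spark.comet.enabled", "false")
    val sparkDf = spark.sql(sql)
    val expected = sparkDf.collect()
    val sparkPlan = sparkDf.queryExecution.executedPlan

    spark.conf.set("spark.comet.enabled", "true")
    val cometDf = spark.sql(sql)
    assert(expected.sameElements(cometDf.collect()))
    (sparkPlan, cometDf.queryExecution.executedPlan)
  }

  // Additionally requires the plan to be fully native: with the fallback for
  // COUNT(distinct) removed, these queries must not contain Spark operators.
  def checkSparkAnswerAndOperator(sql: String): Unit = {
    val (_, cometPlan) = checkSparkAnswer(sql)
    val nonNative = cometPlan.collect {
      case op if !op.nodeName.startsWith("Comet") => op.nodeName
    }
    assert(nonNative.isEmpty, s"non-native operators in plan: $nonNative")
  }
}
```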
Don't we need to pass the `aggExpr.isDistinct` value into the protobuf plan?
This is a good point, I was thinking the same, but IMO Spark doesn't call count distinct in the partial phase. The local distinct is done by `HashAggregate`, so when count distinct gets called as `aggExpr` it might not need the flag, since the data is already deduplicated on the reducers. Checking the Final stage though.
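As a standalone illustration of this point (not part of the PR; the table and values are made up), one can print the physical plan Spark produces for a COUNT(DISTINCT) query:

```scala
// Standalone sketch: inspect how Spark plans COUNT(DISTINCT). Spark rewrites
// the query into two aggregation levels: a grouping on the distinct key that
// deduplicates rows (partial, then final after the shuffle), followed by a
// count over the already-deduplicated rows.
import org.apache.spark.sql.SparkSession

object CountDistinctPlan extends App {
  val spark = SparkSession.builder().master("local[2]").getOrCreate()

  spark.range(100).selectExpr("id % 7 AS c1").createOrReplaceTempView("t1")

  val plan = spark.sql("SELECT count(DISTINCT c1) FROM t1").queryExecution.executedPlan
  // Expect four HashAggregate nodes: two with keys=[c1] and no aggregate
  // functions (the dedup), then two computing count(c1) with no keys.
  println(plan)

  spark.stop()
}
```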
Thanks for explaining that. We should add tests for other distinct aggregates as well, such as `sum` and `avg`. I'm not sure if there are others?
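For illustration, such follow-up queries might look like the following sketch (hypothetical; column names c1 and c2 are assumed, and these queries are not part of this PR):

```scala
// Hypothetical follow-up queries for other distinct aggregates, mirroring
// the count(distinct) tests above (column names c1/c2 are assumed).
val otherDistinctAggQueries = Seq(
  "SELECT sum(DISTINCT c1) FROM t1",
  "SELECT avg(DISTINCT c1) FROM t1",
  "SELECT c2, sum(DISTINCT c1) FROM t1 GROUP BY c2")
// Each would go through checkSparkAnswerAndOperator once the native path
// supports it, or checkSparkAnswer while it still falls back to Spark.
```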
Spark has tests for the following aggregates with DISTINCT:
For this PR, we could just remove the fallback for COUNT?
yep, will do once