Skip to content

Commit 0353790

Browse files
mihailoale-dbcloud-fan
authored and committed
[SPARK-54871][SQL] Trim aliases from grouping and aggregate expressions before handling grouping analytics
### What changes were proposed in this pull request? In this PR I propose to trim aliases from grouping and aggregate expressions before handling grouping analytics. This is needed for the following query: ``` SELECT col1 AS k2 FROM values(1) GROUP BY CUBE(k2) ``` Here we have `col1` in the single-pass whereas in the fixed-point we have `col1 AS k2` before constructing an `Aggregate` in `ResolveGroupingAnalytics`. Change removes the `AS k2` part and keeps the compatibility between single-pass and fixed-point analyzers without changing outputs (analyzed plans are different, only names). The change also inlines the behavior with regular aggregates (without grouping analytics). In other words: ``` SELECT col1 + col2 AS a FROM VALUES(1,2) GROUP BY a ``` Here `col1 + col2` should be the grouping expression (as it is) and we inline the behaviors. ### Why are the changes needed? To keep the compatibility between fixed-point and single-pass analyzers. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Changed tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #53644 from mihailoale-db/trimaliasesgroupinganalytics. Authored-by: mihailoale-db <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 0567dcd commit 0353790

File tree

4 files changed

+27
-24
lines changed

4 files changed

+27
-24
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,7 @@ class Analyzer(
755755
groupByExprs: Seq[Expression],
756756
aggregationExprs: Seq[NamedExpression],
757757
child: LogicalPlan): LogicalPlan = {
758+
val aggregationExprsNoAlias = aggregationExprs.map(trimNonTopLevelAliases)
758759

759760
if (groupByExprs.size > GroupingID.dataType.defaultSize * 8) {
760761
throw QueryCompilationErrors.groupingSizeTooLargeError(GroupingID.dataType.defaultSize * 8)
@@ -775,7 +776,7 @@ class Analyzer(
775776
val groupingAttrs = expand.output.drop(child.output.length)
776777

777778
val aggregations = constructAggregateExprs(
778-
groupByExprs, aggregationExprs, groupByAliases, groupingAttrs, gid)
779+
groupByExprs, aggregationExprsNoAlias, groupByAliases, groupingAttrs, gid)
779780

780781
Aggregate(groupingAttrs, aggregations, expand)
781782
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -267,24 +267,26 @@ object GroupingID {
267267
}
268268
}
269269

270-
object GroupingAnalytics {
270+
object GroupingAnalytics extends AliasHelper {
271271
def unapply(exprs: Seq[Expression])
272272
: Option[(Seq[Seq[Expression]], Seq[Expression])] = {
273-
if (!exprs.exists(_.isInstanceOf[BaseGroupingSets])) {
273+
val exprsNoAlias = exprs.map(trimAliases)
274+
275+
if (!exprsNoAlias.exists(_.isInstanceOf[BaseGroupingSets])) {
274276
None
275277
} else {
276-
val resolved = exprs.forall {
278+
val resolved = exprsNoAlias.forall {
277279
case gs: BaseGroupingSets => gs.childrenResolved
278280
case other => other.resolved
279281
}
280282
if (!resolved) {
281283
None
282284
} else {
283-
val groups = exprs.flatMap {
285+
val groups = exprsNoAlias.flatMap {
284286
case gs: BaseGroupingSets => gs.groupByExprs
285287
case other: Expression => other :: Nil
286288
}
287-
val unmergedSelectedGroupByExprs = exprs.map {
289+
val unmergedSelectedGroupByExprs = exprsNoAlias.map {
288290
case gs: BaseGroupingSets => gs.selectedGroupByExprs
289291
case other: Expression => Seq(Seq(other))
290292
}

sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -501,9 +501,9 @@ Project [course#x, year#x]
501501
-- !query
502502
SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
503503
-- !query analysis
504-
Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
505-
+- Expand [[a#x, b#x, k1#x, k2#x, 0], [a#x, b#x, k1#x, null, 1], [a#x, b#x, null, k2#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, k1#x, k2#x, spark_grouping_id#xL]
506-
+- Project [a#x, b#x, (a#x + b#x) AS k1#x, b#x AS k2#x]
504+
Aggregate [(a + b)#x, b#x, spark_grouping_id#xL], [(a + b)#x AS k1#x, b#x AS k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
505+
+- Expand [[a#x, b#x, (a + b)#x, b#x, 0], [a#x, b#x, (a + b)#x, null, 1], [a#x, b#x, null, b#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, (a + b)#x, b#x, spark_grouping_id#xL]
506+
+- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS b#x]
507507
+- SubqueryAlias testdata
508508
+- View (`testData`, [a#x, b#x])
509509
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -515,9 +515,9 @@ Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS s
515515
-- !query
516516
SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
517517
-- !query analysis
518-
Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
519-
+- Expand [[a#x, b#x, k#x, b#x, 0], [a#x, b#x, k#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, k#x, b#x, spark_grouping_id#xL]
520-
+- Project [a#x, b#x, (a#x + b#x) AS k#x, b#x AS b#x]
518+
Aggregate [(a + b)#x, b#x, spark_grouping_id#xL], [(a + b)#x AS k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
519+
+- Expand [[a#x, b#x, (a + b)#x, b#x, 0], [a#x, b#x, (a + b)#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, (a + b)#x, b#x, spark_grouping_id#xL]
520+
+- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS b#x]
521521
+- SubqueryAlias testdata
522522
+- View (`testData`, [a#x, b#x])
523523
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -529,9 +529,9 @@ Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((
529529
-- !query
530530
SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k)
531531
-- !query analysis
532-
Aggregate [(a + b)#x, k#x, spark_grouping_id#xL], [(a + b)#x AS (a + b)#x, k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
533-
+- Expand [[a#x, b#x, null, k#x, 2]], [a#x, b#x, (a + b)#x, k#x, spark_grouping_id#xL]
534-
+- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS k#x]
532+
Aggregate [(a + b)#x, b#x, spark_grouping_id#xL], [(a + b)#x AS (a + b)#x, b#x AS k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
533+
+- Expand [[a#x, b#x, null, b#x, 2]], [a#x, b#x, (a + b)#x, b#x, spark_grouping_id#xL]
534+
+- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS b#x]
535535
+- SubqueryAlias testdata
536536
+- View (`testData`, [a#x, b#x])
537537
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]

sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -374,9 +374,9 @@ Project [course#x, year#x]
374374
-- !query
375375
SELECT udf(a + b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
376376
-- !query analysis
377-
Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
378-
+- Expand [[a#x, b#x, k1#x, k2#x, 0], [a#x, b#x, k1#x, null, 1], [a#x, b#x, null, k2#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, k1#x, k2#x, spark_grouping_id#xL]
379-
+- Project [a#x, b#x, cast(udf(cast((a#x + b#x) as string)) as int) AS k1#x, cast(udf(cast(b#x as string)) as int) AS k2#x]
377+
Aggregate [udf((a + b))#x, udf(b)#x, spark_grouping_id#xL], [udf((a + b))#x AS k1#x, udf(b)#x AS k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
378+
+- Expand [[a#x, b#x, udf((a + b))#x, udf(b)#x, 0], [a#x, b#x, udf((a + b))#x, null, 1], [a#x, b#x, null, udf(b)#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, udf((a + b))#x, udf(b)#x, spark_grouping_id#xL]
379+
+- Project [a#x, b#x, cast(udf(cast((a#x + b#x) as string)) as int) AS udf((a + b))#x, cast(udf(cast(b#x as string)) as int) AS udf(b)#x]
380380
+- SubqueryAlias testdata
381381
+- View (`testData`, [a#x, b#x])
382382
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -388,9 +388,9 @@ Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS s
388388
-- !query
389389
SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
390390
-- !query analysis
391-
Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
392-
+- Expand [[a#x, b#x, k#x, b#x, 0], [a#x, b#x, k#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, k#x, b#x, spark_grouping_id#xL]
393-
+- Project [a#x, b#x, cast(udf(cast(cast(udf(cast((a#x + b#x) as string)) as int) as string)) as int) AS k#x, b#x AS b#x]
391+
Aggregate [udf(udf((a + b)))#x, b#x, spark_grouping_id#xL], [udf(udf((a + b)))#x AS k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
392+
+- Expand [[a#x, b#x, udf(udf((a + b)))#x, b#x, 0], [a#x, b#x, udf(udf((a + b)))#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, udf(udf((a + b)))#x, b#x, spark_grouping_id#xL]
393+
+- Project [a#x, b#x, cast(udf(cast(cast(udf(cast((a#x + b#x) as string)) as int) as string)) as int) AS udf(udf((a + b)))#x, b#x AS b#x]
394394
+- SubqueryAlias testdata
395395
+- View (`testData`, [a#x, b#x])
396396
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -402,9 +402,9 @@ Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((
402402
-- !query
403403
SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k)
404404
-- !query analysis
405-
Aggregate [(a + b)#x, k#x, spark_grouping_id#xL], [cast(udf(cast((a + b)#x as string)) as int) AS udf((a + b))#x, k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
406-
+- Expand [[a#x, b#x, null, k#x, 2]], [a#x, b#x, (a + b)#x, k#x, spark_grouping_id#xL]
407-
+- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, cast(udf(cast(cast(udf(cast(b#x as string)) as int) as string)) as int) AS k#x]
405+
Aggregate [(a + b)#x, udf(udf(b))#x, spark_grouping_id#xL], [cast(udf(cast((a + b)#x as string)) as int) AS udf((a + b))#x, udf(udf(b))#x AS k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
406+
+- Expand [[a#x, b#x, null, udf(udf(b))#x, 2]], [a#x, b#x, (a + b)#x, udf(udf(b))#x, spark_grouping_id#xL]
407+
+- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, cast(udf(cast(cast(udf(cast(b#x as string)) as int) as string)) as int) AS udf(udf(b))#x]
408408
+- SubqueryAlias testdata
409409
+- View (`testData`, [a#x, b#x])
410410
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]

0 commit comments

Comments
 (0)