[SPARK-50630][SQL] Fix GROUP BY ordinal support for pipe SQL AGGREGATE operators

dtenedor · cloud-fan · commit 08675b169270 · 2024-12-23T12:53:59.000+08:00
### What changes were proposed in this pull request?

This PR fixes GROUP BY ordinal support for pipe SQL AGGREGATE operators. It adds a new `UnresolvedPipeAggregateOrdinal` expression to represent these ordinals. In this context, the ordinal refers to the one-based position of the column in the input relation. Note that this behavior is different from GROUP BY ordinals in regular SQL, wherein the ordinal refers to the one-based position of the column in the SELECT clause instead.

For example:

```
select 3 as x, 4 as y, 5 as z
|> aggregate sum(y) group by 2, 3
> 4, 5, 4

select 3 as x, 4 as y, 5 as z
|> aggregate sum(y) group by 1, 2, 3
> 3, 4, 5, 4
```

This PR also makes a small fix for `|> UNION` (and other set operations) to prefer future pipe operators to apply on the result of the entire union, rather than binding to the right leg of the union only (to allay reported confusion during testing). For example, `values (0, 1) s(x, y) |> union all values (2, 3) t(x, y) |> drop x` will succeed rather than report an error that the number of columns does not match.

### Why are the changes needed?

The current implementation has a bug where the ordinals are sometimes mistakenly retained as literal integers.

### Does this PR introduce _any_ user-facing change?

Yes, see above.

### How was this patch tested?

This PR adds new golden file based test coverage.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #49248 from dtenedor/group-by-ordinals-pipe-aggregate.

Authored-by: Daniel Tenedorio <daniel.tenedorio@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -1523,7 +1523,7 @@ operatorPipeRightSide
     | unpivotClause pivotClause?
     | sample
     | joinRelation
-    | operator=(UNION | EXCEPT | SETMINUS | INTERSECT) setQuantifier? right=queryTerm
+    | operator=(UNION | EXCEPT | SETMINUS | INTERSECT) setQuantifier? right=queryPrimary
     | queryOrganization
     | AGGREGATE namedExpressionSeq? aggregationClause?
     ;
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -1887,10 +1887,14 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
 
       // Replace the index with the corresponding expression in aggregateExpressions. The index is
       // a 1-base position of aggregateExpressions, which is output columns (select expression)
-      case Aggregate(groups, aggs, child, hint) if aggs.forall(_.resolved) &&
+      case Aggregate(groups, aggs, child, hint)
+        if aggs
+          .filter(!containUnresolvedPipeAggregateOrdinal(_))
+          .forall(_.resolved) &&
         groups.exists(containUnresolvedOrdinal) =>
-        val newGroups = groups.map(resolveGroupByExpressionOrdinal(_, aggs))
-        Aggregate(newGroups, aggs, child, hint)
+        val newAggs = aggs.map(resolvePipeAggregateExpressionOrdinal(_, child.output))
+        val newGroups = groups.map(resolveGroupByExpressionOrdinal(_, newAggs))
+        Aggregate(newGroups, newAggs, child, hint)
     }
 
     private def containUnresolvedOrdinal(e: Expression): Boolean = e match {
@@ -1899,6 +1903,11 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
       case _ => false
     }
 
+    /**
+     * Tells whether `e` is an [[UnresolvedAlias]] whose immediate child is an
+     * [[UnresolvedPipeAggregateOrdinal]]. Only the top-level shape of `e` is inspected; nested
+     * sub-expressions are not searched.
+     */
+    private def containUnresolvedPipeAggregateOrdinal(e: Expression): Boolean =
+      PartialFunction.cond(e) {
+        case UnresolvedAlias(_: UnresolvedPipeAggregateOrdinal, _) => true
+      }
+
     private def resolveGroupByExpressionOrdinal(
         expr: Expression,
         aggs: Seq[Expression]): Expression = expr match {
@@ -1934,6 +1943,17 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
     }
   }
 
+  /**
+   * Resolves a GROUP BY ordinal of the SQL pipe aggregate operator ("|> AGGREGATE") to the
+   * attribute of the child operator that it designates.
+   *
+   * An ordinal outside the range [1, inputs.size] is reported through
+   * `QueryCompilationErrors.groupByPositionRangeError` instead of escaping as an
+   * IndexOutOfBoundsException from the sequence lookup.
+   *
+   * @param expr an entry of the aggregate list; only an [[UnresolvedAlias]] that directly wraps
+   *             an [[UnresolvedPipeAggregateOrdinal]] is rewritten
+   * @param inputs the output attributes of the child operator
+   * @return the attribute at the one-based ordinal position, or `expr` unchanged when it is not a
+   *         pipe aggregate ordinal
+   */
+  private def resolvePipeAggregateExpressionOrdinal(
+      expr: NamedExpression,
+      inputs: Seq[Attribute]): NamedExpression = expr match {
+    case UnresolvedAlias(UnresolvedPipeAggregateOrdinal(index), _) =>
+      // In this case, the user applied the SQL pipe aggregate operator ("|> AGGREGATE") and used
+      // ordinals in its GROUP BY clause. This expression then refers to the i-th attribute of the
+      // child operator (one-based). Here we resolve the ordinal to the corresponding attribute.
+      // `lift` yields None for every position outside the child's output, including ordinals
+      // less than one, so an invalid ordinal becomes an explicit range error.
+      inputs.lift(index - 1).getOrElse {
+        throw QueryCompilationErrors.groupByPositionRangeError(index, inputs.size)
+      }
+    case other =>
+      other
+  }
 
   /**
    * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -956,6 +956,28 @@ case class UnresolvedOrdinal(ordinal: Int)
   final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_ORDINAL)
 }
 
+/**
+ * Placeholder for an integer ordinal written in the GROUP BY clause of a SQL pipe aggregate
+ * operator ("|> AGGREGATE"), kept in the plan until the analyzer replaces it.
+ *
+ * Such an ordinal is the one-based position of a column of the operator's input relation. This
+ * differs from GROUP BY ordinals in regular SQL, which count positions in the SELECT clause.
+ *
+ * For example, the ordinal below designates `y`, the second column of the input relation:
+ * {{{
+ *   values (3, 4) tab(x, y)
+ *   |> aggregate sum(x) group by 2
+ * }}}
+ *
+ * The expression always reports itself as unresolved, and asking for its `dataType` or
+ * `nullable` throws an [[UnresolvedException]].
+ *
+ * @param ordinal one-based (not zero-based) position of the input column
+ */
+case class UnresolvedPipeAggregateOrdinal(ordinal: Int)
+  extends LeafExpression with Unevaluable with NonSQLExpression {
+  override lazy val resolved: Boolean = false
+  override def nullable: Boolean = throw new UnresolvedException("nullable")
+  override def dataType: DataType = throw new UnresolvedException("dataType")
+}
+
 /**
  * Represents unresolved having clause, the child for it can be Aggregate, GroupingSets, Rollup
  * and Cube. It is turned by the analyzer into a Filter.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -6016,7 +6016,8 @@ class AstBuilder extends DataTypeAstBuilder
     // analyzer behave as if we had added the corresponding SQL clause after a table subquery
     // containing the input plan.
     def withSubqueryAlias(): LogicalPlan = left match {
-      case _: SubqueryAlias | _: UnresolvedRelation | _: Join | _: Filter =>
+      case _: SubqueryAlias | _: UnresolvedRelation | _: Join | _: Filter |
+           _: GlobalLimit | _: LocalLimit | _: Offset | _: Sort =>
         left
       case _ =>
         SubqueryAlias(SubqueryAlias.generateSubqueryName(), left)
@@ -6137,7 +6138,7 @@ class AstBuilder extends DataTypeAstBuilder
         "The AGGREGATE clause requires a list of aggregate expressions " +
           "or a list of grouping expressions, or both", ctx)
     }
-    // Visit each aggregate expression, and add a PipeAggregate expression on top of it to generate
+    // Visit each aggregate expression, and add a [[PipeExpression]] on top of it to generate
     // clear error messages if the expression does not contain at least one aggregate function.
     val aggregateExpressions: Seq[NamedExpression] =
       Option(ctx.namedExpressionSeq()).map { n: NamedExpressionSeqContext =>
@@ -6183,12 +6184,28 @@ class AstBuilder extends DataTypeAstBuilder
           a.aggregateExpressions.foreach(visit)
           // Prepend grouping keys to the list of aggregate functions, since operator pipe AGGREGATE
           // clause returns the GROUP BY expressions followed by the list of aggregate functions.
-          val namedGroupingExpressions: Seq[NamedExpression] =
-            a.groupingExpressions.map {
-              case n: NamedExpression => n
-              case e: Expression => UnresolvedAlias(e, None)
-            }
-          a.copy(aggregateExpressions = namedGroupingExpressions ++ a.aggregateExpressions)
+          val newGroupingExpressions = ArrayBuffer.empty[Expression]
+          val newAggregateExpressions = ArrayBuffer.empty[NamedExpression]
+          a.groupingExpressions.foreach {
+            case n: NamedExpression =>
+              newGroupingExpressions += n
+              newAggregateExpressions += n
+            // If the grouping expression is an integer literal, create [[UnresolvedOrdinal]] and
+            // [[UnresolvedPipeAggregateOrdinal]] expressions to represent it in the final grouping
+            // and aggregate expressions, respectively. This will let the
+            // [[ResolveOrdinalInOrderByAndGroupBy]] rule detect the ordinal in the aggregate list
+            // and replace it with the corresponding attribute from the child operator.
+            case Literal(v: Int, IntegerType) if conf.groupByOrdinal =>
+              newGroupingExpressions += UnresolvedOrdinal(newAggregateExpressions.length + 1)
+              newAggregateExpressions += UnresolvedAlias(UnresolvedPipeAggregateOrdinal(v), None)
+            case e: Expression =>
+              newGroupingExpressions += e
+              newAggregateExpressions += UnresolvedAlias(e, None)
+          }
+          newAggregateExpressions.appendAll(a.aggregateExpressions)
+          a.copy(
+            groupingExpressions = newGroupingExpressions.toSeq,
+            aggregateExpressions = newAggregateExpressions.toSeq)
       }
     }.getOrElse {
       // This is a table aggregation with no grouping expressions.
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out
@@ -1539,6 +1539,78 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+table t
+|> select x, length(y) as z
+|> limit 1000
+|> where x + length(y) < 4
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`y`",
+    "proposal" : "`x`, `z`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 71,
+    "stopIndex" : 71,
+    "fragment" : "y"
+  } ]
+}
+
+
+-- !query
+table t
+|> select x, length(y) as z
+|> limit 1000 offset 1
+|> where x + length(y) < 4
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`y`",
+    "proposal" : "`x`, `z`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 80,
+    "stopIndex" : 80,
+    "fragment" : "y"
+  } ]
+}
+
+
+-- !query
+table t
+|> select x, length(y) as z
+|> order by x, y
+|> where x + length(y) < 4
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`y`",
+    "proposal" : "`x`, `z`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 52,
+    "stopIndex" : 52,
+    "fragment" : "y"
+  } ]
+}
+
+
 -- !query
 (select x, sum(length(y)) as sum_len from t group by x)
 |> where sum(length(y)) = 3
@@ -2697,21 +2769,34 @@ Union false, false
 
 
 -- !query
-values (0, 1) tab(x, y)
+values (2, 'xyz') tab(x, y)
 |> union table t
 |> where x = 0
 -- !query analysis
-Distinct
-+- Union false, false
-   :- Project [x#x, cast(y#x as bigint) AS y#xL]
-   :  +- SubqueryAlias tab
-   :     +- LocalRelation [x#x, y#x]
-   +- Project [x#x, cast(y#x as bigint) AS y#xL]
-      +- Filter (x#x = 0)
+Filter (x#x = 0)
++- SubqueryAlias __auto_generated_subquery_name
+   +- Distinct
+      +- Union false, false
+         :- SubqueryAlias tab
+         :  +- LocalRelation [x#x, y#x]
          +- SubqueryAlias spark_catalog.default.t
             +- Relation spark_catalog.default.t[x#x,y#x] csv
 
 
+-- !query
+values (2, 'xyz') tab(x, y)
+|> union table t
+|> drop x
+-- !query analysis
+Project [y#x]
++- Distinct
+   +- Union false, false
+      :- SubqueryAlias tab
+      :  +- LocalRelation [x#x, y#x]
+      +- SubqueryAlias spark_catalog.default.t
+         +- Relation spark_catalog.default.t[x#x,y#x] csv
+
+
 -- !query
 (select * from t)
 |> union all (select * from t)
@@ -2878,10 +2963,9 @@ table t
 -- !query analysis
 GlobalLimit 1
 +- LocalLimit 1
-   +- SubqueryAlias __auto_generated_subquery_name
-      +- Sort [x#x ASC NULLS FIRST], true
-         +- SubqueryAlias spark_catalog.default.t
-            +- Relation spark_catalog.default.t[x#x,y#x] csv
+   +- Sort [x#x ASC NULLS FIRST], true
+      +- SubqueryAlias spark_catalog.default.t
+         +- Relation spark_catalog.default.t[x#x,y#x] csv
 
 
 -- !query
@@ -3109,11 +3193,101 @@ Aggregate [x#x, y#x], [x#x, y#x]
 select 3 as x, 4 as y
 |> aggregate group by 1, 2
 -- !query analysis
-Aggregate [1, 2], [1 AS 1#x, 2 AS 2#x]
+Aggregate [x#x, y#x], [x#x, y#x]
 +- Project [3 AS x#x, 4 AS y#x]
    +- OneRowRelation
 
 
+-- !query
+values (3, 4) as tab(x, y)
+|> aggregate sum(y) group by 1
+-- !query analysis
+Aggregate [x#x], [x#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- SubqueryAlias tab
+   +- LocalRelation [x#x, y#x]
+
+
+-- !query
+values (3, 4), (5, 4) as tab(x, y)
+|> aggregate sum(y) group by 1
+-- !query analysis
+Aggregate [x#x], [x#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- SubqueryAlias tab
+   +- LocalRelation [x#x, y#x]
+
+
+-- !query
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 1, 1
+-- !query analysis
+Aggregate [x#x, x#x], [x#x, x#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x]
+   +- OneRowRelation
+
+
+-- !query
+select 1 as `1`, 2 as `2`
+|> aggregate sum(`2`) group by `1`
+-- !query analysis
+Aggregate [1#x], [1#x, pipeexpression(sum(2#x), true, AGGREGATE) AS pipeexpression(sum(2))#xL]
++- Project [1 AS 1#x, 2 AS 2#x]
+   +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 2
+-- !query analysis
+Aggregate [y#x], [y#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x]
+   +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2
+-- !query analysis
+Aggregate [y#x], [y#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+   +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 3
+-- !query analysis
+Aggregate [z#x], [z#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+   +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2, 3
+-- !query analysis
+Aggregate [y#x, z#x], [y#x, z#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+   +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 1, 2, 3
+-- !query analysis
+Aggregate [x#x, y#x, z#x], [x#x, y#x, z#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+   +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by x, 2, 3
+-- !query analysis
+Aggregate [x#x, y#x, z#x], [x#x, y#x, z#x, pipeexpression(sum(y#x), true, AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+   +- OneRowRelation
+
+
 -- !query
 table t
 |> aggregate sum(x)
diff --git a/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql b/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql
diff --git a/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out b/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out

Original file line number	Diff line number	Diff line change
`@@ -1523,7 +1523,7 @@ operatorPipeRightSide`
`1523`	`1523`	`\| unpivotClause pivotClause?`
`1524`	`1524`	`\| sample`
`1525`	`1525`	`\| joinRelation`
`1526`		`- \| operator=(UNION \| EXCEPT \| SETMINUS \| INTERSECT) setQuantifier? right=queryTerm`
	`1526`	`+ \| operator=(UNION \| EXCEPT \| SETMINUS \| INTERSECT) setQuantifier? right=queryPrimary`
`1527`	`1527`	`\| queryOrganization`
`1528`	`1528`	`\| AGGREGATE namedExpressionSeq? aggregationClause?`
`1529`	`1529`	`;`