diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 9d228de9b5908..c8dab9d7169da 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4284,6 +4284,11 @@ "The function is invoked with DISTINCT and WITHIN GROUP but expressions and do not match. The WITHIN GROUP ordering expression must be picked from the function inputs." ] }, + "MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST" : { + "message" : [ + "The function with DISTINCT and WITHIN GROUP (ORDER BY) is not supported for input. Explicitly cast the input to before passing it to both the function argument and ORDER BY expression." + ] + }, "WITHIN_GROUP_MISSING" : { "message" : [ "WITHIN GROUP is required for the function." diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 7e134d7f23ebe..89b1b9b84b150 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -449,9 +449,8 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString messageParameters = Map("funcName" -> toSQLExpr(w))) case agg @ AggregateExpression(listAgg: ListAgg, _, _, _, _) - if agg.isDistinct && listAgg.needSaveOrderValue => - throw QueryCompilationErrors.functionAndOrderExpressionMismatchError( - listAgg.prettyName, listAgg.child, listAgg.orderExpressions) + if agg.isDistinct && listAgg.hasDistinctOrderIncompatibility => + listAgg.throwDistinctOrderError() case w: WindowExpression => WindowResolution.validateResolvedWindowExpression(w) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AggregateExpressionResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AggregateExpressionResolver.scala index b194a4e44a9d1..0a3513ff30264 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AggregateExpressionResolver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AggregateExpressionResolver.scala @@ -79,7 +79,9 @@ class AggregateExpressionResolver( * `handleOuterAggregateExpression`); * - Validation: * 1. [[ListAgg]] is not allowed in DISTINCT aggregates if it contains [[SortOrder]] different - * from its child; + * from its child. However, when [[SQLConf.LISTAGG_ALLOW_DISTINCT_CAST_WITH_ORDER]] is + * enabled, a mismatch is tolerated if it is solely due to a [[Cast]] whose source type is + * injective when cast to string (see [[ListAgg.hasDistinctOrderIncompatibility]]); * 2. Nested aggregate functions are not allowed; * 3. Nondeterministic expressions in the subtree of a related aggregate function are not * allowed; @@ -116,8 +118,8 @@ class AggregateExpressionResolver( private def validateResolvedAggregateExpression(aggregateExpression: AggregateExpression): Unit = aggregateExpression match { case agg @ AggregateExpression(listAgg: ListAgg, _, _, _, _) - if agg.isDistinct && listAgg.needSaveOrderValue => - throwFunctionAndOrderExpressionMismatchError(listAgg) + if agg.isDistinct && listAgg.hasDistinctOrderIncompatibility => + listAgg.throwDistinctOrderError() case _ => if (expressionResolutionContextStack.peek().hasAggregateExpressions) { throwNestedAggregateFunction(aggregateExpression) @@ -212,14 +214,6 @@ class AggregateExpressionResolver( } } - private def throwFunctionAndOrderExpressionMismatchError(listAgg: ListAgg) = { - throw QueryCompilationErrors.functionAndOrderExpressionMismatchError( - listAgg.prettyName, - listAgg.child, - listAgg.orderExpressions - ) - } - private def throwNestedAggregateFunction(aggregateExpression: AggregateExpression): Nothing = { throw new AnalysisException( errorClass = "NESTED_AGGREGATE_FUNCTION", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala index 29163d08297da..4fa6b166f75b0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala @@ -21,6 +21,7 @@ import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, Growable} import scala.util.{Left, Right} +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess} @@ -31,6 +32,7 @@ import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, TypeUtil import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLExpr import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.errors.DataTypeErrors.{toSQLId, toSQLType} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.types.StringTypeWithCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{ByteArray, UTF8String} @@ -564,6 +566,140 @@ case class ListAgg( false } + /** + * Returns true if the ordering expression is incompatible with DISTINCT deduplication. + * + * When LISTAGG(DISTINCT col) WITHIN GROUP (ORDER BY col) is used on a non-string column, + * the child is implicitly cast to string (with UTF8_BINARY collation). The DISTINCT rewrite + * (see [[RewriteDistinctAggregates]]) uses GROUP BY on both the original and cast columns, + * so the cast must preserve equality semantics: values that are GROUP BY-equal must cast to + * equal strings, and vice versa. Types like Float/Double violate this because IEEE 754 + * negative zero (-0.0) and positive zero (0.0) are equal but produce different strings. + * + * Returns false when the order expression matches the child (i.e., [[needSaveOrderValue]] + * is false). Otherwise, the behavior depends on the + * [[SQLConf.LISTAGG_ALLOW_DISTINCT_CAST_WITH_ORDER]] config: + * - If enabled, delegates to [[orderMismatchCastSafety]] to determine whether the + * mismatch is due to a safe cast, an unsafe cast, or not a cast at all. + * - If disabled, any mismatch is considered incompatible. + * + * @return true if an incompatibility exists, false if the ordering is safe + * @see [[throwDistinctOrderError]] to throw the appropriate error when this returns true + */ + def hasDistinctOrderIncompatibility: Boolean = { + needSaveOrderValue && { + if (SQLConf.get.listaggAllowDistinctCastWithOrder) { + orderMismatchCastSafety match { + case CastSafetyResult.SafeCast => false + case _ => true + } + } else { + true + } + } + } + + def throwDistinctOrderError(): Nothing = { + if (SQLConf.get.listaggAllowDistinctCastWithOrder) { + orderMismatchCastSafety match { + case CastSafetyResult.UnsafeCast(inputType, castType) => + throwFunctionAndOrderExpressionUnsafeCastError(inputType, castType) + case CastSafetyResult.NotACast => + throwFunctionAndOrderExpressionMismatchError() + case CastSafetyResult.SafeCast => + throw SparkException.internalError( + "ListAgg.throwDistinctOrderError should not be called when the cast is safe") + } + } else { + throwFunctionAndOrderExpressionMismatchError() + } + } + + private def throwFunctionAndOrderExpressionMismatchError() = { + throw QueryCompilationErrors.functionAndOrderExpressionMismatchError( + prettyName, child, orderExpressions) + } + + private def throwFunctionAndOrderExpressionUnsafeCastError( + inputType: DataType, castType: DataType) = { + throw QueryCompilationErrors.functionAndOrderExpressionUnsafeCastError( + prettyName, inputType, castType) + } + + /** + * Classifies the order-expression mismatch as a safe cast, unsafe cast, or not a cast. + * + * @see [[hasDistinctOrderIncompatibility]] for the full invariant this enforces + */ + private def orderMismatchCastSafety: CastSafetyResult = { + if (orderExpressions.size != 1) return CastSafetyResult.NotACast + child match { + case Cast(castChild, castType, _, _) + if orderExpressions.head.child.semanticEquals(castChild) => + if (isCastSafeForDistinct(castChild.dataType) && + isCastTargetSafeForDistinct(castType)) { + CastSafetyResult.SafeCast + } else { + CastSafetyResult.UnsafeCast(castChild.dataType, castType) + } + case _ => CastSafetyResult.NotACast + } + } + + /** + * Returns true if casting `dt` to string/binary is injective for DISTINCT deduplication. + * + * @see [[hasDistinctOrderIncompatibility]] + */ + private def isCastSafeForDistinct(dt: DataType): Boolean = dt match { + case _: IntegerType | LongType | ShortType | ByteType => true + case _: DecimalType => true + case _: DateType | TimestampNTZType => true + case _: TimeType => true + case _: CalendarIntervalType => true + case _: YearMonthIntervalType => true + case _: DayTimeIntervalType => true + case BooleanType => true + case BinaryType => true + case st: StringType => st.isUTF8BinaryCollation + case _: DoubleType | FloatType => false + // During DST fall-back, two distinct UTC epochs can format to the same local time string + // because the default format omits the timezone offset. TimestampNTZType is safe (uses UTC). + case _: TimestampType => false + case _ => false + } + + /** + * Returns true if the target type's equality semantics are safe for DISTINCT deduplication + * (i.e., UTF8_BINARY collation or BinaryType). + * + * @see [[hasDistinctOrderIncompatibility]] + */ + private def isCastTargetSafeForDistinct(dt: DataType): Boolean = dt match { + case st: StringType => st.isUTF8BinaryCollation + case BinaryType => true + case _ => false + } + + /** + * Result of checking whether a LISTAGG(DISTINCT) order-expression mismatch + * is caused by a cast and whether that cast is safe for deduplication. + */ + private sealed trait CastSafetyResult + + private object CastSafetyResult { + /** The mismatch is not due to a cast at all. */ + case object NotACast extends CastSafetyResult + + /** The mismatch is due to a cast that is safe for DISTINCT. */ + case object SafeCast extends CastSafetyResult + + /** The mismatch is due to a cast that is unsafe for DISTINCT. */ + case class UnsafeCast( + inputType: DataType, + castType: DataType) extends CastSafetyResult + } + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy( child = newChildren.head, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index b4b01d6803679..a17240554ce09 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -1116,6 +1116,18 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat "orderingExpr" -> orderExpr.map(order => toSQLExpr(order.child)).mkString(", "))) } + def functionAndOrderExpressionUnsafeCastError( + functionName: String, + inputType: DataType, + castType: DataType): Throwable = { + new AnalysisException( + errorClass = "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + messageParameters = Map( + "funcName" -> toSQLId(functionName), + "inputType" -> toSQLType(inputType), + "castType" -> toSQLType(castType))) + } + def wrongCommandForObjectTypeError( operation: String, requiredType: String, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f17199547665f..81f4a2bde4b73 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -6889,6 +6889,17 @@ object SQLConf { .booleanConf .createWithDefault(Utils.isTesting) + val LISTAGG_ALLOW_DISTINCT_CAST_WITH_ORDER = + buildConf("spark.sql.listagg.allowDistinctCastWithOrder.enabled") + .internal() + .doc("When true, LISTAGG(DISTINCT expr) WITHIN GROUP (ORDER BY expr) is allowed on " + + "non-string expr when the implicit cast to string preserves equality (e.g., integer, " + + "decimal, date). When false, the function argument and ORDER BY expression must have " + + "the exact same type, which requires explicit casts.") + .version("4.2.0") + .booleanConf + .createWithDefault(true) + /** * Holds information about keys that have been deprecated. * @@ -8115,6 +8126,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def isTimeTypeEnabled: Boolean = getConf(SQLConf.TIME_TYPE_ENABLED) + def listaggAllowDistinctCastWithOrder: Boolean = getConf(LISTAGG_ALLOW_DISTINCT_CAST_WITH_ORDER) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out index 7a7d7aa3f0472..ebf428339c7ac 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out @@ -118,3 +118,72 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "orderingExpr" : "\"collate(c1, utf8_binary)\"" } } + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS STRING), NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING COLLATE UTF8_LCASE\"", + "funcName" : "`listagg`", + "inputType" : "\"STRING\"" + } +} + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS STRING), NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING COLLATE UTF8_LCASE\"", + "funcName" : "`listagg`", + "inputType" : "\"BINARY\"" + } +} + + +-- !query +SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as binary), null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS BINARY), NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM (SELECT col COLLATE UTF8_LCASE AS col FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col)) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"BINARY\"", + "funcName" : "`listagg`", + "inputType" : "\"STRING COLLATE UTF8_LCASE\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out index 1d78d882cf6a8..146aa9ff6f521 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out @@ -272,7 +272,7 @@ Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x DESC NULLS LAST, -- !query -WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t +WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t -- !query analysis WithCTE :- CTERelationDef xxxx, false @@ -281,13 +281,13 @@ WithCTE : +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x] : +- SubqueryAlias __auto_generated_subquery_name : +- LocalRelation [col1#x] -+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x] +- SubqueryAlias t +- CTERelationRef xxxx, true, [col#x], false, false, 1 -- !query -WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t +WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t -- !query analysis WithCTE :- CTERelationDef xxxx, false @@ -296,13 +296,13 @@ WithCTE : +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x] : +- SubqueryAlias __auto_generated_subquery_name : +- LocalRelation [col1#x] -+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x] +- SubqueryAlias t +- CTERelationRef xxxx, true, [col#x], false, false, 1 -- !query -WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'42'), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t +WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'42')), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t -- !query analysis WithCTE :- CTERelationDef xxxx, false @@ -311,7 +311,7 @@ WithCTE : +- Aggregate [listagg(col1#x, 0x42, 0, 0) AS listagg(col1, X'42')#x] : +- SubqueryAlias __auto_generated_subquery_name : +- LocalRelation [col1#x] -+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0x42 as string)) AS regexp_count(col, X'42')#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0x42)) AS regexp_count(hex(col), hex(X'42'))#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x] +- SubqueryAlias t +- CTERelationRef xxxx, true, [col#x], false, false, 1 @@ -335,6 +335,156 @@ WithCTE +- CTERelationRef xxxx, true, [col1#x, col2#x], false, false, 1 +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), (2), (2), (3) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as bigint)), (cast(2 as bigint)), (cast(2 as bigint)), (cast(3 as bigint)) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#xL as string), ,, col#xL ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#xL] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as smallint)), (cast(2 as smallint)), (cast(2 as smallint)), (cast(3 as smallint)) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as tinyint)), (cast(2 as tinyint)), (cast(2 as tinyint)), (cast(3 as tinyint)) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.10 as decimal(10,2))), (cast(2.20 as decimal(10,2))), (cast(2.20 as decimal(10,2))) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (DATE'2024-01-01'), (DATE'2024-01-02'), (DATE'2024-01-01') AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP_NTZ'2024-01-01 10:00:00'), (TIMESTAMP_NTZ'2024-01-02 12:00:00'), (TIMESTAMP_NTZ'2024-01-01 10:00:00') AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (true), (false), (true) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (INTERVAL '1' MONTH), (INTERVAL '2' MONTH), (INTERVAL '1' MONTH) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT cast(col as string), ',') WITHIN GROUP (ORDER BY col) FROM VALUES (10), (1), (2), (20), (2) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS STRING), ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col DESC) FROM VALUES (1), (10), (2), (20), (2) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x DESC NULLS LAST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col DESC NULLS LAST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), (2), (null), (2), (3) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col NULLS FIRST) FROM VALUES (1), (null), (2), (null) AS t(col) +-- !query analysis +Aggregate [listagg(distinct cast(col#x as string), ,, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, ,) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [col#x] + + +-- !query +SELECT grp, listagg(DISTINCT col) WITHIN GROUP (ORDER BY col) FROM VALUES (1, 'a'), (1, 'b'), (2, 'a'), (2, 'a'), (1, 'b') AS t(grp, col) GROUP BY grp +-- !query analysis +Aggregate [grp#x], [grp#x, listagg(distinct col#x, null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col, NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x] ++- SubqueryAlias t + +- LocalRelation [grp#x, col#x] + + +-- !query +WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')), regexp_count(hex(col), hex(X'CAFE')) FROM t +-- !query analysis +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x AS col#x] +: +- Aggregate [listagg(distinct col1#x, 0x2C, col1#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x, regexp_count(hex(col#x), hex(0xCAFE)) AS regexp_count(hex(col), hex(X'CAFE'))#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false, 1 + + +-- !query +WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(hex(col), hex(X'AA')), regexp_count(hex(col), hex(X'BB')) FROM t +-- !query analysis +WithCTE +:- CTERelationDef xxxx, false +: +- SubqueryAlias t +: +- Project [listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x AS col#x] +: +- Aggregate [listagg(distinct col1#x, 0x7C, col1#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x] +: +- SubqueryAlias __auto_generated_subquery_name +: +- LocalRelation [col1#x] ++- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xAA)) AS regexp_count(hex(col), hex(X'AA'))#x, regexp_count(hex(col#x), hex(0xBB)) AS regexp_count(hex(col), hex(X'BB'))#x] + +- SubqueryAlias t + +- CTERelationRef xxxx, true, [col#x], false, false, 1 + + +-- !query +SELECT grp, hex(listagg(DISTINCT col, X'2C') WITHIN GROUP (ORDER BY col)) FROM VALUES (1, X'AA'), (1, X'BB'), (1, X'AA'), (2, X'CC'), (2, X'CC') AS t(grp, col) GROUP BY grp +-- !query analysis +Aggregate [grp#x], [grp#x, hex(listagg(distinct col#x, 0x2C, col#x ASC NULLS FIRST, 0, 0)) AS hex(listagg(DISTINCT col, X'2C') WITHIN GROUP (ORDER BY col ASC NULLS FIRST))#x] ++- SubqueryAlias t + +- LocalRelation [grp#x, col#x] + + -- !query SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1) -- !query analysis @@ -507,3 +657,48 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "orderingExpr" : "\"col1\", \"col2\"" } } + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.1 as double)), (cast(2.2 as double)), (cast(2.2 as double)), (cast(3.3 as double)) AS t(col) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING\"", + "funcName" : "`listagg`", + "inputType" : "\"DOUBLE\"" + } +} + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.0 as float)), (cast(2.0 as float)), (cast(2.0 as float)) AS t(col) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING\"", + "funcName" : "`listagg`", + "inputType" : "\"FLOAT\"" + } +} + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP'2024-01-01 10:00:00'), (TIMESTAMP'2024-01-02 12:00:00'), (TIMESTAMP'2024-01-01 10:00:00') AS t(col) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING\"", + "funcName" : "`listagg`", + "inputType" : "\"TIMESTAMP\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql index aa3d02dc2fb66..86466711f93a5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql @@ -12,3 +12,19 @@ WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_ -- Error case with collations SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1); + +-- LISTAGG DISTINCT cast safety with collations: +-- string -> string (safe): explicit cast to same collation +SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col); +-- string -> string (unsafe): cast to non-binary-equality collation on target +SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col); + +-- binary -> string (safe): cast to default STRING (UTF8_BINARY) +SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col); -- ABC, abc, ABC +-- binary -> string (unsafe): cast to non-binary-equality collation on target +SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col); -- ABC, abc, ABC + +-- string -> binary (safe): UTF8_BINARY source, BinaryType target +SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col); +-- string -> binary (unsafe): non-binary-equality source (UTF8_LCASE), BinaryType target +SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM (SELECT col COLLATE UTF8_LCASE AS col FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col)) diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql index 021de9975d308..3a7401893aaaf 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql @@ -24,11 +24,30 @@ WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df; SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df; SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df; -WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t; -WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t; -WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'42'), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t; +WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t; +WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t; +WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'42')), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t; WITH t(col1, col2) AS (SELECT listagg(col1), listagg(col2, ',') FROM df2) SELECT len(col1), regexp_count(col1, '1'), regexp_count(col1, '2'), regexp_count(col1, '3'), len(col2), regexp_count(col2, 'true'), regexp_count(col1, 'false') FROM t; +-- LISTAGG with DISTINCT with implicit cast from non-string types (safe types - should succeed) +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), (2), (2), (3) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as bigint)), (cast(2 as bigint)), (cast(2 as bigint)), (cast(3 as bigint)) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as smallint)), (cast(2 as smallint)), (cast(2 as smallint)), (cast(3 as smallint)) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as tinyint)), (cast(2 as tinyint)), (cast(2 as tinyint)), (cast(3 as tinyint)) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.10 as decimal(10,2))), (cast(2.20 as decimal(10,2))), (cast(2.20 as decimal(10,2))) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (DATE'2024-01-01'), (DATE'2024-01-02'), (DATE'2024-01-01') AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP_NTZ'2024-01-01 10:00:00'), (TIMESTAMP_NTZ'2024-01-02 12:00:00'), (TIMESTAMP_NTZ'2024-01-01 10:00:00') AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (true), (false), (true) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (INTERVAL '1' MONTH), (INTERVAL '2' MONTH), (INTERVAL '1' MONTH) AS t(col); +SELECT listagg(DISTINCT cast(col as string), ',') WITHIN GROUP (ORDER BY col) FROM VALUES (10), (1), (2), (20), (2) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col DESC) FROM VALUES (1), (10), (2), (20), (2) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), (2), (null), (2), (3) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col NULLS FIRST) FROM VALUES (1), (null), (2), (null) AS t(col); +SELECT grp, listagg(DISTINCT col) WITHIN GROUP (ORDER BY col) FROM VALUES (1, 'a'), (1, 'b'), (2, 'a'), (2, 'a'), (1, 'b') AS t(grp, col) GROUP BY grp; +WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')), regexp_count(hex(col), hex(X'CAFE')) FROM t; +WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(hex(col), hex(X'AA')), regexp_count(hex(col), hex(X'BB')) FROM t; +SELECT grp, hex(listagg(DISTINCT col, X'2C') WITHIN GROUP (ORDER BY col)) FROM VALUES (1, X'AA'), (1, X'BB'), (1, X'AA'), (2, X'CC'), (2, X'CC') AS t(grp, col) GROUP BY grp; + -- Error cases SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1); SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1); @@ -39,3 +58,6 @@ SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM d SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df; SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df; SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df; +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.1 as double)), (cast(2.2 as double)), (cast(2.2 as double)), (cast(3.3 as double)) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.0 as float)), (cast(2.0 as float)), (cast(2.0 as float)) AS t(col); +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP'2024-01-01 10:00:00'), (TIMESTAMP'2024-01-02 12:00:00'), (TIMESTAMP'2024-01-01 10:00:00') AS t(col); diff --git a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out index 1f8c5822e7d84..6e826a3ed9be6 100644 --- a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out @@ -80,3 +80,78 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "orderingExpr" : "\"collate(c1, utf8_binary)\"" } } + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col) +-- !query schema +struct +-- !query output +ABCabc + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING COLLATE UTF8_LCASE\"", + "funcName" : "`listagg`", + "inputType" : "\"STRING\"" + } +} + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col) +-- !query schema +struct +-- !query output +ABCabc + + +-- !query +SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING COLLATE UTF8_LCASE\"", + "funcName" : "`listagg`", + "inputType" : "\"BINARY\"" + } +} + + +-- !query +SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col) +-- !query schema +struct +-- !query output +ABCabc + + +-- !query +SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM (SELECT col COLLATE UTF8_LCASE AS col FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"BINARY\"", + "funcName" : "`listagg`", + "inputType" : "\"STRING COLLATE UTF8_LCASE\"" + } +} diff --git a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out index 22ffceedb627d..ca387fe70fa13 100644 --- a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out @@ -158,25 +158,25 @@ bbaa -- !query -WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t +WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t -- !query schema -struct +struct -- !query output 4 1 1 -- !query -WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t +WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t -- !query schema -struct +struct -- !query output 4 1 1 -- !query -WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'42'), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t +WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'42')), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t -- !query schema -struct +struct -- !query output 5 1 1 1 @@ -189,6 +189,144 @@ struct +-- !query output +1,2,3 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as bigint)), (cast(2 as bigint)), (cast(2 as bigint)), (cast(3 as bigint)) AS t(col) +-- !query schema +struct +-- !query output +1,2,3 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as smallint)), (cast(2 as smallint)), (cast(2 as smallint)), (cast(3 as smallint)) AS t(col) +-- !query schema +struct +-- !query output +1,2,3 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1 as tinyint)), (cast(2 as tinyint)), (cast(2 as tinyint)), (cast(3 as tinyint)) AS t(col) +-- !query schema +struct +-- !query output +1,2,3 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.10 as decimal(10,2))), (cast(2.20 as decimal(10,2))), (cast(2.20 as decimal(10,2))) AS t(col) +-- !query schema +struct +-- !query output +1.10,2.20 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (DATE'2024-01-01'), (DATE'2024-01-02'), (DATE'2024-01-01') AS t(col) +-- !query schema +struct +-- !query output +2024-01-01,2024-01-02 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP_NTZ'2024-01-01 10:00:00'), (TIMESTAMP_NTZ'2024-01-02 12:00:00'), (TIMESTAMP_NTZ'2024-01-01 10:00:00') AS t(col) +-- !query schema +struct +-- !query output +2024-01-01 10:00:00,2024-01-02 12:00:00 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (true), (false), (true) AS t(col) +-- !query schema +struct +-- !query output +false,true + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (INTERVAL '1' MONTH), (INTERVAL '2' MONTH), (INTERVAL '1' MONTH) AS t(col) +-- !query schema +struct +-- !query output +INTERVAL '1' MONTH,INTERVAL '2' MONTH + + +-- !query +SELECT listagg(DISTINCT cast(col as string), ',') WITHIN GROUP (ORDER BY col) FROM VALUES (10), (1), (2), (20), (2) AS t(col) +-- !query schema +struct +-- !query output +1,2,10,20 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col DESC) FROM VALUES (1), (10), (2), (20), (2) AS t(col) +-- !query schema +struct +-- !query output +20,10,2,1 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), (2), (null), (2), (3) AS t(col) +-- !query schema +struct +-- !query output +1,2,3 + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col NULLS FIRST) FROM VALUES (1), (null), (2), (null) AS t(col) +-- !query schema +struct +-- !query output +1,2 + + +-- !query +SELECT grp, listagg(DISTINCT col) WITHIN GROUP (ORDER BY col) FROM VALUES (1, 'a'), (1, 'b'), (2, 'a'), (2, 'a'), (1, 'b') AS t(grp, col) GROUP BY grp +-- !query schema +struct +-- !query output +1 ab +2 a + + +-- !query +WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')), regexp_count(hex(col), hex(X'CAFE')) FROM t +-- !query schema +struct +-- !query output +8 1 1 1 + + +-- !query +WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(hex(col), hex(X'AA')), regexp_count(hex(col), hex(X'BB')) FROM t +-- !query schema +struct +-- !query output +3 1 1 + + +-- !query +SELECT grp, hex(listagg(DISTINCT col, X'2C') WITHIN GROUP (ORDER BY col)) FROM VALUES (1, X'AA'), (1, X'BB'), (1, X'AA'), (2, X'CC'), (2, X'CC') AS t(grp, col) GROUP BY grp +-- !query schema +struct +-- !query output +1 AA2CBB +2 CC + + -- !query SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1) -- !query schema @@ -374,3 +512,54 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "orderingExpr" : "\"col1\", \"col2\"" } } + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.1 as double)), (cast(2.2 as double)), (cast(2.2 as double)), (cast(3.3 as double)) AS t(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING\"", + "funcName" : "`listagg`", + "inputType" : "\"DOUBLE\"" + } +} + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (cast(1.0 as float)), (cast(2.0 as float)), (cast(2.0 as float)) AS t(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING\"", + "funcName" : "`listagg`", + "inputType" : "\"FLOAT\"" + } +} + + +-- !query +SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (TIMESTAMP'2024-01-01 10:00:00'), (TIMESTAMP'2024-01-02 12:00:00'), (TIMESTAMP'2024-01-01 10:00:00') AS t(col) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST", + "sqlState" : "42K0K", + "messageParameters" : { + "castType" : "\"STRING\"", + "funcName" : "`listagg`", + "inputType" : "\"TIMESTAMP\"" + } +}