You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: common/utils/src/main/resources/error/error-conditions.json
+1-1Lines changed: 1 addition & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -4286,7 +4286,7 @@
4286
4286
},
4287
4287
"MISMATCH_WITH_DISTINCT_INPUT_UNSAFE_CAST" : {
4288
4288
"message" : [
4289
-
"The function <funcName> with DISTINCT requires a cast from <inputType> to <castType>, but this cast may not preserve equality semantics for the input type (e.g., floating-point -0.0 and 0.0 are treated as equal during GROUP BY but cast to different strings, leading to incorrect deduplication)."
4289
+
"The function <funcName> with DISTINCT and WITHIN GROUP (ORDER BY) is not supported for <inputType> input. Explicitly cast the input to <castType> before passing it to the function argument and ORDER BY expression."
Copy file name to clipboardExpand all lines: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AggregateExpressionResolver.scala
+13-13Lines changed: 13 additions & 13 deletions
Original file line number
Diff line number
Diff line change
@@ -115,25 +115,25 @@ class AggregateExpressionResolver(
SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col)
125
+
-- !query analysis
126
+
Aggregate [listagg(distinct cast(col#x as string), null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS STRING), NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x]
127
+
+- SubqueryAlias t
128
+
+- LocalRelation [col#x]
129
+
130
+
131
+
-- !query
132
+
SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col)
SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col)
148
+
-- !query analysis
149
+
Aggregate [listagg(distinct cast(col#x as string), null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS STRING), NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x]
150
+
+- SubqueryAlias t
151
+
+- LocalRelation [col#x]
152
+
153
+
123
154
-- !query
124
155
SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col)
SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col)
171
+
-- !query analysis
172
+
Aggregate [listagg(distinct cast(col#x as binary), null, col#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT CAST(col AS BINARY), NULL) WITHIN GROUP (ORDER BY col ASC NULLS FIRST)#x]
173
+
+- SubqueryAlias t
174
+
+- LocalRelation [col#x]
175
+
176
+
177
+
-- !query
178
+
SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM (SELECT col COLLATE UTF8_LCASE AS col FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col))
Copy file name to clipboardExpand all lines: sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
+10-10Lines changed: 10 additions & 10 deletions
Original file line number
Diff line number
Diff line change
@@ -272,7 +272,7 @@ Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x DESC NULLS LAST,
272
272
273
273
274
274
-- !query
275
-
WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t
275
+
WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t
276
276
-- !query analysis
277
277
WithCTE
278
278
:- CTERelationDef xxxx, false
@@ -281,13 +281,13 @@ WithCTE
281
281
: +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x]
282
282
: +- SubqueryAlias __auto_generated_subquery_name
283
283
: +- LocalRelation [col1#x]
284
-
+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x]
284
+
+- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x]
WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t
290
+
WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t
291
291
-- !query analysis
292
292
WithCTE
293
293
:- CTERelationDef xxxx, false
@@ -296,13 +296,13 @@ WithCTE
296
296
: +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x]
297
297
: +- SubqueryAlias __auto_generated_subquery_name
298
298
: +- LocalRelation [col1#x]
299
-
+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x]
299
+
+- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x]
WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'42'), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t
305
+
WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'42')), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t
306
306
-- !query analysis
307
307
WithCTE
308
308
:- CTERelationDef xxxx, false
@@ -311,7 +311,7 @@ WithCTE
311
311
: +- Aggregate [listagg(col1#x, 0x42, 0, 0) AS listagg(col1, X'42')#x]
312
312
: +- SubqueryAlias __auto_generated_subquery_name
313
313
: +- LocalRelation [col1#x]
314
-
+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0x42 as string)) AS regexp_count(col, X'42')#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x]
314
+
+- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0x42)) AS regexp_count(hex(col), hex(X'42'))#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x]
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF'), regexp_count(col, X'CAFE') FROM t
451
+
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')), regexp_count(hex(col), hex(X'CAFE')) FROM t
452
452
-- !query analysis
453
453
WithCTE
454
454
:- CTERelationDef xxxx, false
@@ -457,13 +457,13 @@ WithCTE
457
457
: +- Aggregate [listagg(distinct col1#x, 0x2C, col1#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x]
458
458
: +- SubqueryAlias __auto_generated_subquery_name
459
459
: +- LocalRelation [col1#x]
460
-
+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0xDEAD as string)) AS regexp_count(col, X'DEAD')#x, regexp_count(cast(col#x as string), cast(0xBEEF as string)) AS regexp_count(col, X'BEEF')#x, regexp_count(cast(col#x as string), cast(0xCAFE as string)) AS regexp_count(col, X'CAFE')#x]
460
+
+- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xDEAD)) AS regexp_count(hex(col), hex(X'DEAD'))#x, regexp_count(hex(col#x), hex(0xBEEF)) AS regexp_count(hex(col), hex(X'BEEF'))#x, regexp_count(hex(col#x), hex(0xCAFE)) AS regexp_count(hex(col), hex(X'CAFE'))#x]
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(col, X'AA'), regexp_count(col, X'BB') FROM t
466
+
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(hex(col), hex(X'AA')), regexp_count(hex(col), hex(X'BB')) FROM t
467
467
-- !query analysis
468
468
WithCTE
469
469
:- CTERelationDef xxxx, false
@@ -472,7 +472,7 @@ WithCTE
472
472
: +- Aggregate [listagg(distinct col1#x, 0x7C, col1#x ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x]
473
473
: +- SubqueryAlias __auto_generated_subquery_name
474
474
: +- LocalRelation [col1#x]
475
-
+- Project [len(col#x) AS len(col)#x, regexp_count(cast(col#x as string), cast(0xAA as string)) AS regexp_count(col, X'AA')#x, regexp_count(cast(col#x as string), cast(0xBB as string)) AS regexp_count(col, X'BB')#x]
475
+
+- Project [len(col#x) AS len(col)#x, regexp_count(hex(col#x), hex(0xAA)) AS regexp_count(hex(col), hex(X'AA'))#x, regexp_count(hex(col#x), hex(0xBB)) AS regexp_count(hex(col), hex(X'BB'))#x]
Copy file name to clipboardExpand all lines: sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
+16-1Lines changed: 16 additions & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -12,4 +12,19 @@ WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE unicode_
12
12
13
13
-- Error case with collations
14
14
SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1);
15
-
SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col)
15
+
16
+
-- LISTAGG DISTINCT cast safety with collations:
17
+
-- string -> string (safe): explicit cast to same collation
18
+
SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col);
19
+
-- string -> string (unsafe): cast to non-binary-equality collation on target
20
+
SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col);
21
+
22
+
-- binary -> string (safe): cast to default STRING (UTF8_BINARY)
23
+
SELECT listagg(DISTINCT CAST(col AS STRING)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col); -- ABC, abc, ABC
24
+
-- binary -> string (unsafe): cast to non-binary-equality collation on target
25
+
SELECT listagg(DISTINCT CAST(col AS STRING COLLATE UTF8_LCASE)) WITHIN GROUP (ORDER BY col) FROM VALUES (X'414243'), (X'616263'), (X'414243') AS t(col); -- ABC, abc, ABC
SELECT listagg(DISTINCT CAST(col AS BINARY)) WITHIN GROUP (ORDER BY col) FROM (SELECT col COLLATE UTF8_LCASE AS col FROM VALUES ('ABC'), ('abc'), ('ABC') AS t(col))
Copy file name to clipboardExpand all lines: sql/core/src/test/resources/sql-tests/inputs/listagg.sql
+5-5Lines changed: 5 additions & 5 deletions
Original file line number
Diff line number
Diff line change
@@ -24,9 +24,9 @@ WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM
24
24
SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df;
25
25
SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df;
26
26
SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df;
27
-
WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t;
28
-
WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t;
29
-
WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(col, X'42'), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF') FROM t;
27
+
WITH t(col) AS (SELECT listagg(col1) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t;
28
+
WITH t(col) AS (SELECT listagg(col1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t;
29
+
WITH t(col) AS (SELECT listagg(col1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF'))) SELECT len(col), regexp_count(hex(col), hex(X'42')), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')) FROM t;
30
30
WITH t(col1, col2) AS (SELECT listagg(col1), listagg(col2, ',') FROM df2) SELECT len(col1), regexp_count(col1, '1'), regexp_count(col1, '2'), regexp_count(col1, '3'), len(col2), regexp_count(col2, 'true'), regexp_count(col1, 'false') FROM t;
31
31
32
32
-- LISTAGG with DISTINCT with implicit cast from non-string types (safe types - should succeed)
@@ -44,8 +44,8 @@ SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col DESC) FROM VALUES (
44
44
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col) FROM VALUES (1), (2), (null), (2), (3) AS t(col);
45
45
SELECT listagg(DISTINCT col, ',') WITHIN GROUP (ORDER BY col NULLS FIRST) FROM VALUES (1), (null), (2), (null) AS t(col);
46
46
SELECT grp, listagg(DISTINCT col) WITHIN GROUP (ORDER BY col) FROM VALUES (1, 'a'), (1, 'b'), (2, 'a'), (2, 'a'), (1, 'b') AS t(grp, col) GROUP BY grp;
47
-
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(col, X'DEAD'), regexp_count(col, X'BEEF'), regexp_count(col, X'CAFE') FROM t;
48
-
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(col, X'AA'), regexp_count(col, X'BB') FROM t;
47
+
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'2C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'DEAD'), (X'BEEF'), (X'DEAD'), (X'CAFE'))) SELECT len(col), regexp_count(hex(col), hex(X'DEAD')), regexp_count(hex(col), hex(X'BEEF')), regexp_count(hex(col), hex(X'CAFE')) FROM t;
48
+
WITH t(col) AS (SELECT listagg(DISTINCT col1, X'7C') WITHIN GROUP (ORDER BY col1) FROM (VALUES (X'BB'), (X'AA'), (NULL), (X'BB'))) SELECT len(col), regexp_count(hex(col), hex(X'AA')), regexp_count(hex(col), hex(X'BB')) FROM t;
49
49
SELECT grp, hex(listagg(DISTINCT col, X'2C') WITHIN GROUP (ORDER BY col)) FROM VALUES (1, X'AA'), (1, X'BB'), (1, X'AA'), (2, X'CC'), (2, X'CC') AS t(grp, col) GROUP BY grp;
0 commit comments