Skip to content

Commit eaaf1aa

Browse files
imback82 authored and HyukjinKwon committed
[SPARK-28278][SQL][PYTHON][TESTS] Convert and port 'except-all.sql' into UDF test base
## What changes were proposed in this pull request? This PR adds some tests converted from `except-all.sql` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'except-all.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/except-all.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-except-all.sql.out index 01091a2..b7bfad0 100644 --- a/sql/core/src/test/resources/sql-tests/results/except-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-except-all.sql.out -49,11 +49,11 struct<> -- !query 4 -SELECT * FROM tab1 +SELECT udf(c1) FROM tab1 EXCEPT ALL -SELECT * FROM tab2 +SELECT udf(c1) FROM tab2 -- !query 4 schema -struct<c1:int> +struct<CAST(udf(cast(c1 as string)) AS INT):int> -- !query 4 output 0 2 -62,11 +62,11 NULL -- !query 5 -SELECT * FROM tab1 +SELECT udf(c1) FROM tab1 MINUS ALL -SELECT * FROM tab2 +SELECT udf(c1) FROM tab2 -- !query 5 schema -struct<c1:int> +struct<CAST(udf(cast(c1 as string)) AS INT):int> -- !query 5 output 0 2 -75,11 +75,11 NULL -- !query 6 -SELECT * FROM tab1 +SELECT udf(c1) FROM tab1 EXCEPT ALL -SELECT * FROM tab2 WHERE c1 IS NOT NULL +SELECT udf(c1) FROM tab2 WHERE udf(c1) IS NOT NULL -- !query 6 schema -struct<c1:int> +struct<CAST(udf(cast(c1 as string)) AS INT):int> -- !query 6 output 0 2 -89,21 +89,21 NULL -- !query 7 -SELECT * FROM tab1 WHERE c1 > 5 +SELECT udf(c1) FROM tab1 WHERE udf(c1) > 5 EXCEPT ALL -SELECT * FROM tab2 +SELECT udf(c1) FROM tab2 -- !query 7 schema -struct<c1:int> +struct<CAST(udf(cast(c1 as string)) AS INT):int> -- !query 7 output -- !query 8 -SELECT * FROM tab1 +SELECT udf(c1) FROM tab1 EXCEPT ALL -SELECT * FROM tab2 WHERE c1 > 6 +SELECT udf(c1) FROM tab2 WHERE udf(c1 > udf(6)) -- !query 8 schema -struct<c1:int> +struct<CAST(udf(cast(c1 as string)) AS INT):int> -- !query 8 output 0 1 -117,11 +117,11 NULL -- !query 9 -SELECT * 
FROM tab1 +SELECT udf(c1) FROM tab1 EXCEPT ALL -SELECT CAST(1 AS BIGINT) +SELECT CAST(udf(1) AS BIGINT) -- !query 9 schema -struct<c1:bigint> +struct<CAST(udf(cast(c1 as string)) AS INT):bigint> -- !query 9 output 0 2 -134,7 +134,7 NULL -- !query 10 -SELECT * FROM tab1 +SELECT udf(c1) FROM tab1 EXCEPT ALL SELECT array(1) -- !query 10 schema -145,62 +145,62 ExceptAll can only be performed on tables with the compatible column types. arra -- !query 11 -SELECT * FROM tab3 +SELECT udf(k), v FROM tab3 EXCEPT ALL -SELECT * FROM tab4 +SELECT k, udf(v) FROM tab4 -- !query 11 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 11 output 1 2 1 3 -- !query 12 -SELECT * FROM tab4 +SELECT k, udf(v) FROM tab4 EXCEPT ALL -SELECT * FROM tab3 +SELECT udf(k), v FROM tab3 -- !query 12 schema -struct<k:int,v:int> +struct<k:int,CAST(udf(cast(v as string)) AS INT):int> -- !query 12 output 2 2 2 20 -- !query 13 -SELECT * FROM tab4 +SELECT udf(k), udf(v) FROM tab4 EXCEPT ALL -SELECT * FROM tab3 +SELECT udf(k), udf(v) FROM tab3 INTERSECT DISTINCT -SELECT * FROM tab4 +SELECT udf(k), udf(v) FROM tab4 -- !query 13 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 13 output 2 2 2 20 -- !query 14 -SELECT * FROM tab4 +SELECT udf(k), v FROM tab4 EXCEPT ALL -SELECT * FROM tab3 +SELECT k, udf(v) FROM tab3 EXCEPT DISTINCT -SELECT * FROM tab4 +SELECT udf(k), udf(v) FROM tab4 -- !query 14 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 14 output -- !query 15 -SELECT * FROM tab3 +SELECT k, udf(v) FROM tab3 EXCEPT ALL -SELECT * FROM tab4 +SELECT udf(k), udf(v) FROM tab4 UNION ALL -SELECT * FROM tab3 +SELECT udf(k), v FROM tab3 EXCEPT DISTINCT -SELECT * FROM tab4 +SELECT k, udf(v) FROM tab4 -- !query 15 schema -struct<k:int,v:int> +struct<k:int,CAST(udf(cast(v as string)) AS INT):int> -- !query 15 output 1 3 -217,83 +217,83 ExceptAll can only be 
performed on tables with the same number of columns, but t -- !query 17 -SELECT * FROM tab3 +SELECT udf(k), udf(v) FROM tab3 EXCEPT ALL -SELECT * FROM tab4 +SELECT udf(k), udf(v) FROM tab4 UNION -SELECT * FROM tab3 +SELECT udf(k), udf(v) FROM tab3 EXCEPT DISTINCT -SELECT * FROM tab4 +SELECT udf(k), udf(v) FROM tab4 -- !query 17 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 17 output 1 3 -- !query 18 -SELECT * FROM tab3 +SELECT udf(k), udf(v) FROM tab3 MINUS ALL -SELECT * FROM tab4 +SELECT k, udf(v) FROM tab4 UNION -SELECT * FROM tab3 +SELECT udf(k), udf(v) FROM tab3 MINUS DISTINCT -SELECT * FROM tab4 +SELECT k, udf(v) FROM tab4 -- !query 18 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 18 output 1 3 -- !query 19 -SELECT * FROM tab3 +SELECT k, udf(v) FROM tab3 EXCEPT ALL -SELECT * FROM tab4 +SELECT udf(k), v FROM tab4 EXCEPT DISTINCT -SELECT * FROM tab3 +SELECT k, udf(v) FROM tab3 EXCEPT DISTINCT -SELECT * FROM tab4 +SELECT udf(k), v FROM tab4 -- !query 19 schema -struct<k:int,v:int> +struct<k:int,CAST(udf(cast(v as string)) AS INT):int> -- !query 19 output -- !query 20 SELECT * -FROM (SELECT tab3.k, - tab4.v +FROM (SELECT tab3.k, + udf(tab4.v) FROM tab3 JOIN tab4 - ON tab3.k = tab4.k) + ON udf(tab3.k) = tab4.k) EXCEPT ALL SELECT * -FROM (SELECT tab3.k, - tab4.v +FROM (SELECT udf(tab3.k), + tab4.v FROM tab3 JOIN tab4 - ON tab3.k = tab4.k) + ON tab3.k = udf(tab4.k)) -- !query 20 schema -struct<k:int,v:int> +struct<k:int,CAST(udf(cast(v as string)) AS INT):int> -- !query 20 output -- !query 21 SELECT * -FROM (SELECT tab3.k, - tab4.v +FROM (SELECT udf(udf(tab3.k)), + udf(tab4.v) FROM tab3 JOIN tab4 - ON tab3.k = tab4.k) + ON udf(udf(tab3.k)) = udf(tab4.k)) EXCEPT ALL SELECT * -FROM (SELECT tab4.v AS k, - tab3.k AS v +FROM (SELECT udf(tab4.v) AS k, + udf(udf(tab3.k)) AS v FROM tab3 JOIN tab4 - ON tab3.k = 
tab4.k) + ON udf(tab3.k) = udf(tab4.k)) -- !query 21 schema -struct<k:int,v:int> +struct<CAST(udf(cast(cast(udf(cast(k as string)) as int) as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 21 output 1 2 1 2 -305,11 +305,11 struct<k:int,v:int> -- !query 22 -SELECT v FROM tab3 GROUP BY v +SELECT udf(v) FROM tab3 GROUP BY v EXCEPT ALL -SELECT k FROM tab4 GROUP BY k +SELECT udf(k) FROM tab4 GROUP BY k -- !query 22 schema -struct<v:int> +struct<CAST(udf(cast(v as string)) AS INT):int> -- !query 22 output 3 ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25090 from imback82/except-all. Authored-by: Terry Kim <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent 62004f1 commit eaaf1aa

File tree

2 files changed

+508
-0
lines changed

2 files changed

+508
-0
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
-- This test file was converted from except-all.sql.
-- The queries below wrap columns or predicates in udf(); that function is not defined
-- in this script and must be provided by whatever runs it.

-- Fixtures: two single-column multisets. Both contain duplicate values and NULLs so
-- that the per-row multiplicity handling of EXCEPT ALL is observable.
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
    (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1);
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
    (1), (2), (2), (3), (5), (5), (null) AS tab2(c1);

-- Fixtures: two (k, v) multisets that overlap only partially and contain duplicate rows.
CREATE TEMPORARY VIEW tab3 AS SELECT * FROM VALUES
    (1, 2),
    (1, 2),
    (1, 3),
    (2, 3),
    (2, 2)
    AS tab3(k, v);
CREATE TEMPORARY VIEW tab4 AS SELECT * FROM VALUES
    (1, 2),
    (2, 3),
    (2, 2),
    (2, 2),
    (2, 20)
    AS tab4(k, v);
-- Basic EXCEPT ALL: a multiset difference, so each tab2 row cancels at most one equal
-- tab1 row (NULL rows match each other). Expected rows: 0, 2, 2, NULL.
SELECT udf(c1) FROM tab1
EXCEPT ALL
SELECT udf(c1) FROM tab2;
-- MINUS ALL (synonym for EXCEPT ALL): same inputs as the basic case above, so the
-- same rows are expected.
SELECT udf(c1) FROM tab1
MINUS ALL
SELECT udf(c1) FROM tab2;
-- EXCEPT ALL with the NULL row filtered out of the right branch: nothing on the right
-- can cancel a NULL, so both NULL rows of tab1 survive.
-- Expected rows: 0, 2, 2, NULL, NULL.
SELECT udf(c1) FROM tab1
EXCEPT ALL
SELECT udf(c1) FROM tab2 WHERE udf(c1) IS NOT NULL;
-- Empty left relation: no tab1 value is greater than 5, so the left branch has no rows
-- and the result is empty.
SELECT udf(c1) FROM tab1 WHERE udf(c1) > 5
EXCEPT ALL
SELECT udf(c1) FROM tab2;
-- Empty right relation: no tab2 value is greater than 6, so the right branch has no
-- rows and every tab1 row is returned.
-- NOTE(review): udf() wraps the whole comparison (c1 > udf(6)) rather than just the
-- column -- presumably deliberate, to exercise a UDF call that yields the predicate
-- itself; confirm before "fixing".
SELECT udf(c1) FROM tab1
EXCEPT ALL
SELECT udf(c1) FROM tab2 WHERE udf(c1 > udf(6));
-- Type-coerced EXCEPT ALL: the left branch is an int column and the right branch is a
-- single BIGINT literal row, so the branches have to be coerced to a common type.
-- Only the one tab1 row equal to 1 is cancelled.
SELECT udf(c1) FROM tab1
EXCEPT ALL
SELECT CAST(udf(1) AS BIGINT);
-- Error case: the column types of the two sides are not compatible (int vs. array),
-- so this statement is expected to be rejected rather than return rows.
SELECT udf(c1) FROM tab1
EXCEPT ALL
SELECT array(1);
-- Basic two-column EXCEPT ALL (tab3 minus tab4), with udf() applied to a different
-- column on each side. Expected rows: (1, 2), (1, 3).
SELECT udf(k), v FROM tab3
EXCEPT ALL
SELECT k, udf(v) FROM tab4;
-- Basic two-column EXCEPT ALL in the opposite direction (tab4 minus tab3).
-- Expected rows: (2, 2), (2, 20).
SELECT k, udf(v) FROM tab4
EXCEPT ALL
SELECT udf(k), v FROM tab3;
-- EXCEPT ALL combined with INTERSECT DISTINCT in a single statement, with udf()
-- applied to every projected column.
SELECT udf(k), udf(v) FROM tab4
EXCEPT ALL
SELECT udf(k), udf(v) FROM tab3
INTERSECT DISTINCT
SELECT udf(k), udf(v) FROM tab4;
-- EXCEPT ALL followed by EXCEPT DISTINCT: whatever survives "tab4 minus tab3" is then
-- removed again by subtracting tab4 itself, so no rows are expected.
SELECT udf(k), v FROM tab4
EXCEPT ALL
SELECT k, udf(v) FROM tab3
EXCEPT DISTINCT
SELECT udf(k), udf(v) FROM tab4;
-- Chain of set operations: EXCEPT ALL, then UNION ALL, then EXCEPT DISTINCT, with the
-- udf() wrapping varied from branch to branch.
SELECT k, udf(v) FROM tab3
EXCEPT ALL
SELECT udf(k), udf(v) FROM tab4
UNION ALL
SELECT udf(k), v FROM tab3
EXCEPT DISTINCT
SELECT k, udf(v) FROM tab4;
-- Error case: the branches project a different number of columns (one vs. two), so
-- this statement is expected to be rejected rather than return rows.
SELECT k FROM tab3
EXCEPT ALL
SELECT k, v FROM tab4;
-- Chain of set operations: same shape as the UNION ALL chain, but with a
-- de-duplicating UNION in the middle and udf() applied to every projected column.
SELECT udf(k), udf(v) FROM tab3
EXCEPT ALL
SELECT udf(k), udf(v) FROM tab4
UNION
SELECT udf(k), udf(v) FROM tab3
EXCEPT DISTINCT
SELECT udf(k), udf(v) FROM tab4;
-- Same chain spelled with the MINUS keyword (MINUS ALL / MINUS DISTINCT) in place of
-- EXCEPT.
SELECT udf(k), udf(v) FROM tab3
MINUS ALL
SELECT k, udf(v) FROM tab4
UNION
SELECT udf(k), udf(v) FROM tab3
MINUS DISTINCT
SELECT k, udf(v) FROM tab4;
-- Chain of set operations: one EXCEPT ALL followed by two EXCEPT DISTINCT steps that
-- subtract tab3 and then tab4 again, so no rows are expected.
SELECT k, udf(v) FROM tab3
EXCEPT ALL
SELECT udf(k), v FROM tab4
EXCEPT DISTINCT
SELECT k, udf(v) FROM tab3
EXCEPT DISTINCT
SELECT udf(k), v FROM tab4;
-- Join under EXCEPT ALL. Both branches select tab3.k and tab4.v from the same
-- tab3/tab4 join on k and differ only in where udf() is applied, so the two sides
-- hold the same rows and the result set should be empty.
SELECT *
FROM (SELECT tab3.k,
             udf(tab4.v)
      FROM tab3
      JOIN tab4
        ON udf(tab3.k) = tab4.k)
EXCEPT ALL
SELECT *
FROM (SELECT udf(tab3.k),
             tab4.v
      FROM tab3
      JOIN tab4
        ON tab3.k = udf(tab4.k));
-- Join under EXCEPT ALL (2). The right branch swaps the two columns (tab4.v is
-- exposed as k and tab3.k as v), so unlike the previous case the sides differ and
-- only left rows whose two values are equal can be cancelled. Also exercises nested
-- udf(udf(...)) calls in the projection and the join condition.
SELECT *
FROM (SELECT udf(udf(tab3.k)),
             udf(tab4.v)
      FROM tab3
      JOIN tab4
        ON udf(udf(tab3.k)) = udf(tab4.k))
EXCEPT ALL
SELECT *
FROM (SELECT udf(tab4.v) AS k,
             udf(udf(tab3.k)) AS v
      FROM tab3
      JOIN tab4
        ON udf(tab3.k) = udf(tab4.k));
-- GROUP BY under EXCEPT ALL: each branch is reduced to its distinct grouping values
-- (tab3.v -> 2, 3 and tab4.k -> 1, 2) before the difference is taken.
-- Expected rows: 3.
SELECT udf(v) FROM tab3 GROUP BY v
EXCEPT ALL
SELECT udf(k) FROM tab4 GROUP BY k;
-- Clean-up: drop the four temporary views created at the top of this script.
-- IF EXISTS keeps each statement from failing when its view is already gone.
DROP VIEW IF EXISTS tab1;
DROP VIEW IF EXISTS tab2;
DROP VIEW IF EXISTS tab3;
DROP VIEW IF EXISTS tab4;

0 commit comments

Comments
 (0)