Skip to content

Commit 62004f1

Browse files
imback82HyukjinKwon
authored and committed
[SPARK-28283][SQL][PYTHON][TESTS] Convert and port 'intersect-all.sql' into UDF test base
## What changes were proposed in this pull request? This PR adds some tests converted from `intersect-all.sql` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'intersect-all.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out index 63dd56c..0cb82be 100644 --- a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out -34,11 +34,11 struct<> -- !query 2 -SELECT * FROM tab1 +SELECT udf(k), v FROM tab1 INTERSECT ALL -SELECT * FROM tab2 +SELECT k, udf(v) FROM tab2 -- !query 2 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 2 output 1 2 1 2 -48,11 +48,11 NULL NULL -- !query 3 -SELECT * FROM tab1 +SELECT k, udf(v) FROM tab1 INTERSECT ALL -SELECT * FROM tab1 WHERE k = 1 +SELECT udf(k), v FROM tab1 WHERE udf(k) = 1 -- !query 3 schema -struct<k:int,v:int> +struct<k:int,CAST(udf(cast(v as string)) AS INT):int> -- !query 3 output 1 2 1 2 -61,39 +61,39 struct<k:int,v:int> -- !query 4 -SELECT * FROM tab1 WHERE k > 2 +SELECT udf(k), udf(v) FROM tab1 WHERE k > udf(2) INTERSECT ALL -SELECT * FROM tab2 +SELECT udf(k), udf(v) FROM tab2 -- !query 4 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 4 output -- !query 5 -SELECT * FROM tab1 +SELECT udf(k), v FROM tab1 INTERSECT ALL -SELECT * FROM tab2 WHERE k > 3 +SELECT udf(k), v FROM tab2 WHERE udf(udf(k)) > 3 -- !query 5 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 5 output -- !query 6 -SELECT * FROM tab1 +SELECT udf(k), v FROM tab1 INTERSECT ALL -SELECT CAST(1 AS BIGINT), CAST(2 AS BIGINT) +SELECT CAST(udf(1) AS BIGINT), 
CAST(udf(2) AS BIGINT) -- !query 6 schema -struct<k:bigint,v:bigint> +struct<CAST(udf(cast(k as string)) AS INT):bigint,v:bigint> -- !query 6 output 1 2 -- !query 7 -SELECT * FROM tab1 +SELECT k, udf(v) FROM tab1 INTERSECT ALL -SELECT array(1), 2 +SELECT array(1), udf(2) -- !query 7 schema struct<> -- !query 7 output -102,9 +102,9 IntersectAll can only be performed on tables with the compatible column types. a -- !query 8 -SELECT k FROM tab1 +SELECT udf(k) FROM tab1 INTERSECT ALL -SELECT k, v FROM tab2 +SELECT udf(k), udf(v) FROM tab2 -- !query 8 schema struct<> -- !query 8 output -113,13 +113,13 IntersectAll can only be performed on tables with the same number of columns, bu -- !query 9 -SELECT * FROM tab2 +SELECT udf(k), v FROM tab2 INTERSECT ALL -SELECT * FROM tab1 +SELECT k, udf(v) FROM tab1 INTERSECT ALL -SELECT * FROM tab2 +SELECT udf(k), udf(v) FROM tab2 -- !query 9 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 9 output 1 2 1 2 -129,15 +129,15 NULL NULL -- !query 10 -SELECT * FROM tab1 +SELECT udf(k), v FROM tab1 EXCEPT -SELECT * FROM tab2 +SELECT k, udf(v) FROM tab2 UNION ALL -SELECT * FROM tab1 +SELECT k, udf(udf(v)) FROM tab1 INTERSECT ALL -SELECT * FROM tab2 +SELECT udf(k), v FROM tab2 -- !query 10 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 10 output 1 2 1 2 -148,15 +148,15 NULL NULL -- !query 11 -SELECT * FROM tab1 +SELECT udf(k), udf(v) FROM tab1 EXCEPT -SELECT * FROM tab2 +SELECT udf(k), v FROM tab2 EXCEPT -SELECT * FROM tab1 +SELECT k, udf(v) FROM tab1 INTERSECT ALL -SELECT * FROM tab2 +SELECT udf(k), udf(udf(v)) FROM tab2 -- !query 11 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 11 output 1 3 -165,38 +165,38 struct<k:int,v:int> ( ( ( - SELECT * FROM tab1 + SELECT udf(k), v FROM tab1 EXCEPT - SELECT * FROM tab2 + SELECT k, udf(v) FROM tab2 ) EXCEPT - SELECT * FROM tab1 + 
SELECT udf(k), udf(v) FROM tab1 ) INTERSECT ALL - SELECT * FROM tab2 + SELECT udf(k), udf(v) FROM tab2 ) -- !query 12 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 12 output -- !query 13 SELECT * -FROM (SELECT tab1.k, - tab2.v +FROM (SELECT udf(tab1.k), + udf(tab2.v) FROM tab1 JOIN tab2 - ON tab1.k = tab2.k) + ON udf(udf(tab1.k)) = tab2.k) INTERSECT ALL SELECT * -FROM (SELECT tab1.k, - tab2.v +FROM (SELECT udf(tab1.k), + udf(tab2.v) FROM tab1 JOIN tab2 - ON tab1.k = tab2.k) + ON udf(tab1.k) = udf(udf(tab2.k))) -- !query 13 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 13 output 1 2 1 2 -211,30 +211,30 struct<k:int,v:int> -- !query 14 SELECT * -FROM (SELECT tab1.k, - tab2.v +FROM (SELECT udf(tab1.k), + udf(tab2.v) FROM tab1 JOIN tab2 - ON tab1.k = tab2.k) + ON udf(tab1.k) = udf(tab2.k)) INTERSECT ALL SELECT * -FROM (SELECT tab2.v AS k, - tab1.k AS v +FROM (SELECT udf(tab2.v) AS k, + udf(tab1.k) AS v FROM tab1 JOIN tab2 - ON tab1.k = tab2.k) + ON tab1.k = udf(tab2.k)) -- !query 14 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int> -- !query 14 output -- !query 15 -SELECT v FROM tab1 GROUP BY v +SELECT udf(v) FROM tab1 GROUP BY v INTERSECT ALL -SELECT k FROM tab2 GROUP BY k +SELECT udf(udf(k)) FROM tab2 GROUP BY k -- !query 15 schema -struct<v:int> +struct<CAST(udf(cast(v as string)) AS INT):int> -- !query 15 output 2 3 -250,15 +250,15 spark.sql.legacy.setopsPrecedence.enabled true -- !query 17 -SELECT * FROM tab1 +SELECT udf(k), v FROM tab1 EXCEPT -SELECT * FROM tab2 +SELECT k, udf(v) FROM tab2 UNION ALL -SELECT * FROM tab1 +SELECT udf(k), udf(v) FROM tab1 INTERSECT ALL -SELECT * FROM tab2 +SELECT udf(udf(k)), udf(v) FROM tab2 -- !query 17 schema -struct<k:int,v:int> +struct<CAST(udf(cast(k as string)) AS INT):int,v:int> -- !query 17 output 1 2 1 2 -268,15 +268,15 NULL NULL -- 
!query 18 -SELECT * FROM tab1 +SELECT k, udf(v) FROM tab1 EXCEPT -SELECT * FROM tab2 +SELECT udf(k), v FROM tab2 UNION ALL -SELECT * FROM tab1 +SELECT udf(k), udf(v) FROM tab1 INTERSECT -SELECT * FROM tab2 +SELECT udf(k), udf(udf(v)) FROM tab2 -- !query 18 schema -struct<k:int,v:int> +struct<k:int,CAST(udf(cast(v as string)) AS INT):int> -- !query 18 output 1 2 2 3 ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25119 from imback82/intersect-all-sql. Authored-by: Terry Kim <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent 4645ffb commit 62004f1

File tree

2 files changed

+469
-0
lines changed

2 files changed

+469
-0
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
-- This test file was converted from intersect-all.sql.
2+
3+
-- Left-hand fixture. (1, 2) and (1, 3) each appear twice and there are two
-- all-NULL rows, so the queries below see duplicate and NULL inputs.
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
4+
(1, 2),
5+
(1, 2),
6+
(1, 3),
7+
(1, 3),
8+
(2, 3),
9+
(null, null),
10+
(null, null)
11+
AS tab1(k, v);
12+
-- Right-hand fixture. Shares (1, 2) twice, (2, 3) once and the two all-NULL
-- rows with tab1; (3, 4) appears only here.
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
13+
(1, 2),
14+
(1, 2),
15+
(2, 3),
16+
(3, 4),
17+
(null, null),
18+
(null, null)
19+
AS tab2(k, v);
20+
21+
-- Basic INTERSECT ALL
-- Each branch wraps a different column in udf(): k on the left, v on the right.
22+
SELECT udf(k), v FROM tab1
23+
INTERSECT ALL
24+
SELECT k, udf(v) FROM tab2;
25+
26+
-- INTERSECT ALL same table in both branches
-- The right branch is tab1 again, restricted by the predicate udf(k) = 1.
27+
SELECT k, udf(v) FROM tab1
28+
INTERSECT ALL
29+
SELECT udf(k), v FROM tab1 WHERE udf(k) = 1;
30+
31+
-- Empty left relation
-- tab1 has no k above 2, so the left branch yields no rows
-- (assumes udf() returns its argument unchanged; TODO confirm).
32+
SELECT udf(k), udf(v) FROM tab1 WHERE k > udf(2)
33+
INTERSECT ALL
34+
SELECT udf(k), udf(v) FROM tab2;
35+
36+
-- Empty right relation
-- tab2 has no k above 3, so the right branch yields no rows
-- (assumes udf() returns its argument unchanged; TODO confirm).
37+
SELECT udf(k), v FROM tab1
38+
INTERSECT ALL
39+
SELECT udf(k), v FROM tab2 WHERE udf(udf(k)) > 3;
40+
41+
-- Type Coerced INTERSECT ALL
-- The right branch is a single literal row explicitly cast to BIGINT, so the
-- two branches do not start out with identical column types.
42+
SELECT udf(k), v FROM tab1
43+
INTERSECT ALL
44+
SELECT CAST(udf(1) AS BIGINT), CAST(udf(2) AS BIGINT);
45+
46+
-- Error, as the column types of the two sides are not compatible
-- The first column is k from tab1 on the left but array(1) on the right.
47+
SELECT k, udf(v) FROM tab1
48+
INTERSECT ALL
49+
SELECT array(1), udf(2);
50+
51+
-- Mismatch on number of columns across both branches
-- The left branch selects one column, the right branch selects two.
52+
SELECT udf(k) FROM tab1
53+
INTERSECT ALL
54+
SELECT udf(k), udf(v) FROM tab2;
55+
56+
-- Basic chain: two INTERSECT ALL operators across three branches
-- (tab2, tab1, tab2), each branch with a different udf() placement.
57+
SELECT udf(k), v FROM tab2
58+
INTERSECT ALL
59+
SELECT k, udf(v) FROM tab1
60+
INTERSECT ALL
61+
SELECT udf(k), udf(v) FROM tab2;
62+
63+
-- Chain of different set operations
-- EXCEPT, UNION ALL and INTERSECT ALL in one statement with no parentheses.
64+
SELECT udf(k), v FROM tab1
65+
EXCEPT
66+
SELECT k, udf(v) FROM tab2
67+
UNION ALL
68+
SELECT k, udf(udf(v)) FROM tab1
69+
INTERSECT ALL
70+
SELECT udf(k), v FROM tab2
71+
;
72+
73+
-- Chain of different set operations
-- EXCEPT, EXCEPT and INTERSECT ALL in one statement with no parentheses.
74+
SELECT udf(k), udf(v) FROM tab1
75+
EXCEPT
76+
SELECT udf(k), v FROM tab2
77+
EXCEPT
78+
SELECT k, udf(v) FROM tab1
79+
INTERSECT ALL
80+
SELECT udf(k), udf(udf(v)) FROM tab2
81+
;
82+
83+
-- Test using parentheses to control the order of evaluation
-- The nesting groups the operands as
-- ((tab1 EXCEPT tab2) EXCEPT tab1) INTERSECT ALL tab2.
84+
(
85+
(
86+
(
87+
SELECT udf(k), v FROM tab1
88+
EXCEPT
89+
SELECT k, udf(v) FROM tab2
90+
)
91+
EXCEPT
92+
SELECT udf(k), udf(v) FROM tab1
93+
)
94+
INTERSECT ALL
95+
SELECT udf(k), udf(v) FROM tab2
96+
)
97+
;
98+
99+
-- Join under intersect all
-- Both branches join tab1 to tab2 on k and project the same two columns;
-- they differ only in where udf() is applied inside the ON clause.
100+
SELECT *
101+
FROM (SELECT udf(tab1.k),
102+
udf(tab2.v)
103+
FROM tab1
104+
JOIN tab2
105+
ON udf(udf(tab1.k)) = tab2.k)
106+
INTERSECT ALL
107+
SELECT *
108+
FROM (SELECT udf(tab1.k),
109+
udf(tab2.v)
110+
FROM tab1
111+
JOIN tab2
112+
ON udf(tab1.k) = udf(udf(tab2.k)));
113+
114+
-- Join under intersect all (2)
-- Same join as the left branch, but the right branch swaps the projected
-- columns: tab2.v is aliased to k and tab1.k is aliased to v.
115+
SELECT *
116+
FROM (SELECT udf(tab1.k),
117+
udf(tab2.v)
118+
FROM tab1
119+
JOIN tab2
120+
ON udf(tab1.k) = udf(tab2.k))
121+
INTERSECT ALL
122+
SELECT *
123+
FROM (SELECT udf(tab2.v) AS k,
124+
udf(tab1.k) AS v
125+
FROM tab1
126+
JOIN tab2
127+
ON tab1.k = udf(tab2.k));
128+
129+
-- Group by under intersect all
-- The left branch groups tab1 by v, the right branch groups tab2 by k;
-- udf() is applied to the grouped column in the select list only.
130+
SELECT udf(v) FROM tab1 GROUP BY v
131+
INTERSECT ALL
132+
SELECT udf(udf(k)) FROM tab2 GROUP BY k;
133+
134+
-- Test the pre-Spark 2.4 behaviour of set operation precedence.
135+
-- All the set operators are given equal precedence and are evaluated
136+
-- from left to right as they appear in the query.
137+
138+
-- Set the property
139+
SET spark.sql.legacy.setopsPrecedence.enabled= true;
140+
141+
-- EXCEPT / UNION ALL / INTERSECT ALL chain without parentheses, run after
-- spark.sql.legacy.setopsPrecedence.enabled has been switched on.
SELECT udf(k), v FROM tab1
142+
EXCEPT
143+
SELECT k, udf(v) FROM tab2
144+
UNION ALL
145+
SELECT udf(k), udf(v) FROM tab1
146+
INTERSECT ALL
147+
SELECT udf(udf(k)), udf(v) FROM tab2;
148+
149+
-- Same shape of chain, but ending in plain INTERSECT instead of
-- INTERSECT ALL; also run with the legacy precedence property switched on.
SELECT k, udf(v) FROM tab1
150+
EXCEPT
151+
SELECT udf(k), v FROM tab2
152+
UNION ALL
153+
SELECT udf(k), udf(v) FROM tab1
154+
INTERSECT
155+
SELECT udf(k), udf(udf(v)) FROM tab2;
156+
157+
-- Restore the property
-- Switches spark.sql.legacy.setopsPrecedence.enabled back to false.
158+
SET spark.sql.legacy.setopsPrecedence.enabled = false;
159+
160+
-- Clean-up
-- Drop both temporary views created at the top of this file.
161+
DROP VIEW IF EXISTS tab1;
162+
DROP VIEW IF EXISTS tab2;

0 commit comments

Comments
 (0)