Skip to content

Commit 8d686f3

Browse files
imback82HyukjinKwon
authored andcommitted
[SPARK-28271][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part2.sql' into UDF test base
## What changes were proposed in this pull request? This PR adds some tests converted from `pgSQL/aggregates_part2.sql'` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'pgSQL/aggregates_part2.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out index 2606d2e..00c06f9 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out -57,23 +57,23 true false true false true true true true true -- !query 3 -select min(unique1) from tenk1 +select min(udf(unique1)) from tenk1 -- !query 3 schema -struct<min(unique1):int> +struct<min(udf(unique1)):string> -- !query 3 output 0 -- !query 4 -select max(unique1) from tenk1 +select udf(max(unique1)) from tenk1 -- !query 4 schema -struct<max(unique1):int> +struct<udf(max(unique1)):string> -- !query 4 output 9999 -- !query 5 -select max(unique1) from tenk1 where unique1 < 42 +select max(unique1) from tenk1 where udf(unique1) < 42 -- !query 5 schema struct<max(unique1):int> -- !query 5 output -81,7 +81,7 struct<max(unique1):int> -- !query 6 -select max(unique1) from tenk1 where unique1 > 42 +select max(unique1) from tenk1 where unique1 > udf(42) -- !query 6 schema struct<max(unique1):int> -- !query 6 output -89,7 +89,7 struct<max(unique1):int> -- !query 7 -select max(unique1) from tenk1 where unique1 > 42000 +select max(unique1) from tenk1 where udf(unique1) > 42000 -- !query 7 schema struct<max(unique1):int> -- !query 7 output -97,7 +97,7 NULL -- !query 8 -select max(tenthous) from tenk1 where thousand = 33 +select max(tenthous) from tenk1 where udf(thousand) = 33 -- !query 8 schema struct<max(tenthous):int> -- !query 8 output -105,7 +105,7 
struct<max(tenthous):int> -- !query 9 -select min(tenthous) from tenk1 where thousand = 33 +select min(tenthous) from tenk1 where udf(thousand) = 33 -- !query 9 schema struct<min(tenthous):int> -- !query 9 output -113,15 +113,15 struct<min(tenthous):int> -- !query 10 -select distinct max(unique2) from tenk1 +select distinct max(udf(unique2)) from tenk1 -- !query 10 schema -struct<max(unique2):int> +struct<max(udf(unique2)):string> -- !query 10 output 9999 -- !query 11 -select max(unique2) from tenk1 order by 1 +select max(unique2) from tenk1 order by udf(1) -- !query 11 schema struct<max(unique2):int> -- !query 11 output -129,7 +129,7 struct<max(unique2):int> -- !query 12 -select max(unique2) from tenk1 order by max(unique2) +select max(unique2) from tenk1 order by max(udf(unique2)) -- !query 12 schema struct<max(unique2):int> -- !query 12 output -137,7 +137,7 struct<max(unique2):int> -- !query 13 -select max(unique2) from tenk1 order by max(unique2)+1 +select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1 -- !query 13 schema -struct<max(unique2):int> +struct<udf(max(udf(unique2))):string> -- !query 13 output 9999 -- !query 14 -select t1.max_unique2, g from (select max(unique2) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc +select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc -- !query 14 schema -struct<max_unique2:int,g:int> +struct<max_unique2:string,udf(g):string> -- !query 14 output 9999 3 9999 2 -155,8 +155,8 struct<max_unique2:int,g:int> -- !query 15 -select max(100) from tenk1 +select udf(max(100)) from tenk1 -- !query 15 schema -struct<max(100):int> +struct<udf(max(100)):string> -- !query 15 output 100 ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25086 from imback82/udf_test. 
Authored-by: Terry Kim <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent b598dfd commit 8d686f3

File tree

2 files changed

+394
-0
lines changed

2 files changed

+394
-0
lines changed
Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
--
2+
-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
3+
--
4+
--
5+
-- AGGREGATES [Part 2]
6+
-- https://github.com/postgres/postgres/blob/REL_12_BETA1/src/test/regress/sql/aggregates.sql#L145-L350
7+
--
8+
-- This test file was converted from pgSQL/aggregates_part2.sql.
9+
-- Note that the currently registered UDF returns a string, so there are some differences, for instance
10+
-- in how the Scala and Python UDFs cast values to strings.
11+
12+
-- Fixture view int4_tbl with a single column f1 holding: zero, the pair
-- 123456 / -123456, and 2147483647 (the largest 32-bit signed value) with its negation.
-- NOTE(review): in the visible part of this file the view is only referenced by the
-- commented-out correlated-subquery test ([SPARK-17348]); no active query reads it.
create temporary view int4_tbl as select * from values
13+
(0),
14+
(123456),
15+
(-123456),
16+
(2147483647),
17+
(-2147483647)
18+
as int4_tbl(f1);
19+
20+
-- Test handling of Params within aggregate arguments in hashed aggregation.
21+
-- Per bug report from Jeevan Chalke.
22+
-- [SPARK-27877] Implement SQL-standard LATERAL subqueries
23+
-- explain (verbose, costs off)
24+
-- select s1, s2, sm
25+
-- from generate_series(1, 3) s1,
26+
-- lateral (select s2, sum(s1 + s2) sm
27+
-- from generate_series(1, 3) s2 group by s2) ss
28+
-- order by 1, 2;
29+
-- select s1, s2, sm
30+
-- from generate_series(1, 3) s1,
31+
-- lateral (select s2, sum(s1 + s2) sm
32+
-- from generate_series(1, 3) s2 group by s2) ss
33+
-- order by 1, 2;
34+
35+
-- [SPARK-27878] Support ARRAY(sub-SELECT) expressions
36+
-- explain (verbose, costs off)
37+
-- select array(select sum(x+y) s
38+
-- from generate_series(1,3) y group by y order by s)
39+
-- from generate_series(1,3) x;
40+
-- select array(select sum(x+y) s
41+
-- from generate_series(1,3) y group by y order by s)
42+
-- from generate_series(1,3) x;
43+
44+
-- [SPARK-27879] Implement bitwise integer aggregates(BIT_AND and BIT_OR)
45+
--
46+
-- test for bitwise integer aggregates
47+
--
48+
-- CREATE TEMPORARY TABLE bitwise_test(
49+
-- i2 INT2,
50+
-- i4 INT4,
51+
-- i8 INT8,
52+
-- i INTEGER,
53+
-- x INT2,
54+
-- y BIT(4)
55+
-- );
56+
57+
-- empty case
58+
-- SELECT
59+
-- BIT_AND(i2) AS "?",
60+
-- BIT_OR(i4) AS "?"
61+
-- FROM bitwise_test;
62+
63+
-- COPY bitwise_test FROM STDIN NULL 'null';
64+
-- 1 1 1 1 1 B0101
65+
-- 3 3 3 null 2 B0100
66+
-- 7 7 7 3 4 B1100
67+
-- \.
68+
69+
-- SELECT
70+
-- BIT_AND(i2) AS "1",
71+
-- BIT_AND(i4) AS "1",
72+
-- BIT_AND(i8) AS "1",
73+
-- BIT_AND(i) AS "?",
74+
-- BIT_AND(x) AS "0",
75+
-- BIT_AND(y) AS "0100",
76+
--
77+
-- BIT_OR(i2) AS "7",
78+
-- BIT_OR(i4) AS "7",
79+
-- BIT_OR(i8) AS "7",
80+
-- BIT_OR(i) AS "?",
81+
-- BIT_OR(x) AS "7",
82+
-- BIT_OR(y) AS "1101"
83+
-- FROM bitwise_test;
84+
85+
--
86+
-- test boolean aggregates
87+
--
88+
-- first test all possible transition and final states
89+
90+
-- The result is inconsistent with PostgreSQL because our AND does not have strict mode
91+
-- Expected output (see the generated result file): true true false true false true true true true.
-- The `t` aliases and the "null because strict" remark inside the statement appear to be
-- carried over from the PostgreSQL test; here the 3rd and 5th columns are false, i.e.
-- (FALSE AND NULL) and (NULL AND FALSE) are not NULL.
SELECT
92+
-- boolean and transitions
93+
-- null because strict
94+
(NULL AND NULL) IS NULL AS `t`,
95+
(TRUE AND NULL) IS NULL AS `t`,
96+
(FALSE AND NULL) IS NULL AS `t`,
97+
(NULL AND TRUE) IS NULL AS `t`,
98+
(NULL AND FALSE) IS NULL AS `t`,
99+
-- and actual computations
100+
(TRUE AND TRUE) AS `t`,
101+
NOT (TRUE AND FALSE) AS `t`,
102+
NOT (FALSE AND TRUE) AS `t`,
103+
NOT (FALSE AND FALSE) AS `t`;
104+
105+
-- The result is inconsistent with PostgreSQL because our OR does not have strict mode
106+
-- Expected output (see the generated result file): true false true false true true true true true.
-- The `t` aliases and the "null because strict" remark inside the statement appear to be
-- carried over from the PostgreSQL test; here the 2nd and 4th columns are false, i.e.
-- (TRUE OR NULL) and (NULL OR TRUE) are not NULL.
SELECT
107+
-- boolean or transitions
108+
-- null because strict
109+
(NULL OR NULL) IS NULL AS `t`,
110+
(TRUE OR NULL) IS NULL AS `t`,
111+
(FALSE OR NULL) IS NULL AS `t`,
112+
(NULL OR TRUE) IS NULL AS `t`,
113+
(NULL OR FALSE) IS NULL AS `t`,
114+
-- actual computations
115+
(TRUE OR TRUE) AS `t`,
116+
(TRUE OR FALSE) AS `t`,
117+
(FALSE OR TRUE) AS `t`,
118+
NOT (FALSE OR FALSE) AS `t`;
119+
120+
-- [SPARK-27880] Implement boolean aggregates(BOOL_AND, BOOL_OR and EVERY)
121+
-- CREATE TEMPORARY TABLE bool_test(
122+
-- b1 BOOL,
123+
-- b2 BOOL,
124+
-- b3 BOOL,
125+
-- b4 BOOL);
126+
127+
-- empty case
128+
-- SELECT
129+
-- BOOL_AND(b1) AS "n",
130+
-- BOOL_OR(b3) AS "n"
131+
-- FROM bool_test;
132+
133+
-- COPY bool_test FROM STDIN NULL 'null';
134+
-- TRUE null FALSE null
135+
-- FALSE TRUE null null
136+
-- null TRUE FALSE null
137+
-- \.
138+
139+
-- SELECT
140+
-- BOOL_AND(b1) AS "f",
141+
-- BOOL_AND(b2) AS "t",
142+
-- BOOL_AND(b3) AS "f",
143+
-- BOOL_AND(b4) AS "n",
144+
-- BOOL_AND(NOT b2) AS "f",
145+
-- BOOL_AND(NOT b3) AS "t"
146+
-- FROM bool_test;
147+
148+
-- SELECT
149+
-- EVERY(b1) AS "f",
150+
-- EVERY(b2) AS "t",
151+
-- EVERY(b3) AS "f",
152+
-- EVERY(b4) AS "n",
153+
-- EVERY(NOT b2) AS "f",
154+
-- EVERY(NOT b3) AS "t"
155+
-- FROM bool_test;
156+
157+
-- SELECT
158+
-- BOOL_OR(b1) AS "t",
159+
-- BOOL_OR(b2) AS "t",
160+
-- BOOL_OR(b3) AS "f",
161+
-- BOOL_OR(b4) AS "n",
162+
-- BOOL_OR(NOT b2) AS "f",
163+
-- BOOL_OR(NOT b3) AS "t"
164+
-- FROM bool_test;
165+
166+
--
167+
-- Test cases that should be optimized into indexscans instead of
168+
-- the generic aggregate implementation.
169+
--
170+
171+
-- Basic cases
172+
-- explain
173+
-- select min(unique1) from tenk1;
174+
-- UDF wraps the aggregate's input column, so min() runs over the UDF's string output
-- and the result column is a string.
-- NOTE(review): tenk1 is not created in this file; presumably supplied by the test harness.
select min(udf(unique1)) from tenk1;
175+
-- explain
176+
-- select max(unique1) from tenk1;
177+
-- UDF wraps the aggregate's result instead of its input; the result column is a string.
select udf(max(unique1)) from tenk1;
178+
-- explain
179+
-- select max(unique1) from tenk1 where unique1 < 42;
180+
-- UDF appears only on the filtered column; the aggregate itself is untouched,
-- so the result column stays an int.
select max(unique1) from tenk1 where udf(unique1) < 42;
181+
-- explain
182+
-- select max(unique1) from tenk1 where unique1 > 42;
183+
-- UDF wraps the literal on the right-hand side of the comparison rather than the column;
-- the result column stays an int.
select max(unique1) from tenk1 where unique1 > udf(42);
184+
185+
-- the planner may choose a generic aggregate here if parallel query is
186+
-- enabled, since that plan will be parallel safe and the "optimized"
187+
-- plan, which has almost identical cost, will not be. we want to test
188+
-- the optimized plan, so temporarily disable parallel query.
189+
-- begin;
190+
-- set local max_parallel_workers_per_gather = 0;
191+
-- explain
192+
-- select max(unique1) from tenk1 where unique1 > 42000;
193+
-- UDF on the filtered column; the expected output is NULL (the recorded maximum of
-- unique1 is 9999, so no row passes the filter).
select max(unique1) from tenk1 where udf(unique1) > 42000;
194+
-- rollback;
195+
196+
-- multi-column index (uses tenk1_thous_tenthous)
197+
-- explain
198+
-- select max(tenthous) from tenk1 where thousand = 33;
199+
-- UDF wraps the column in the equality filter; the aggregated column is untouched
-- and the result column stays an int.
select max(tenthous) from tenk1 where udf(thousand) = 33;
200+
-- explain
201+
-- select min(tenthous) from tenk1 where thousand = 33;
202+
-- min() counterpart of the previous query, with the same UDF-wrapped equality filter.
select min(tenthous) from tenk1 where udf(thousand) = 33;
203+
204+
-- [SPARK-17348] Correlated column is not allowed in a non-equality predicate
205+
-- check parameter propagation into an indexscan subquery
206+
-- explain
207+
-- select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt
208+
-- from int4_tbl;
209+
-- select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt
210+
-- from int4_tbl;
211+
212+
-- check some cases that were handled incorrectly in 8.3.0
213+
-- explain
214+
-- select distinct max(unique2) from tenk1;
215+
-- DISTINCT over a single aggregate; the UDF wraps the aggregate's input,
-- so the result column is a string.
select distinct max(udf(unique2)) from tenk1;
216+
-- explain
217+
-- select max(unique2) from tenk1 order by 1;
218+
-- NOTE(review): the original sorts by select-list position (`order by 1`); udf(1) is a
-- function call, so this presumably sorts by a constant expression instead - confirm.
-- The expected output is a single row either way.
select max(unique2) from tenk1 order by udf(1);
219+
-- explain
220+
-- select max(unique2) from tenk1 order by max(unique2);
221+
-- UDF appears only inside the ORDER BY aggregate; the select list is unchanged,
-- so the result column stays an int.
select max(unique2) from tenk1 order by max(udf(unique2));
222+
-- explain
223+
-- select max(unique2) from tenk1 order by max(unique2)+1;
224+
-- UDF is applied both inside and outside max() in the select list, and around max()
-- in ORDER BY, where +1 is then applied to the UDF's result. The result column is a string.
select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1;
225+
-- explain
226+
-- select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
227+
-- Rewrite of the generate_series() query in the comment above: LATERAL VIEW
-- explode(array(1,2,3)) supplies g. The UDF wraps the aggregate's input and g in the
-- select list, while ORDER BY uses the raw g; expected rows are g = 3, 2, 1.
select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc;
228+
229+
-- interesting corner case: constant gets optimized into a seqscan
230+
-- explain
231+
-- select max(100) from tenk1;
232+
-- Aggregate over a constant; the UDF wraps the aggregate's result,
-- so the result column is a string.
select udf(max(100)) from tenk1;
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
-- Automatically generated by SQLQueryTestSuite
2+
-- Number of queries: 16
3+
4+
5+
-- !query 0
6+
create temporary view int4_tbl as select * from values
7+
(0),
8+
(123456),
9+
(-123456),
10+
(2147483647),
11+
(-2147483647)
12+
as int4_tbl(f1)
13+
-- !query 0 schema
14+
struct<>
15+
-- !query 0 output
16+
17+
18+
19+
-- !query 1
20+
SELECT
21+
-- boolean and transitions
22+
-- null because strict
23+
(NULL AND NULL) IS NULL AS `t`,
24+
(TRUE AND NULL) IS NULL AS `t`,
25+
(FALSE AND NULL) IS NULL AS `t`,
26+
(NULL AND TRUE) IS NULL AS `t`,
27+
(NULL AND FALSE) IS NULL AS `t`,
28+
-- and actual computations
29+
(TRUE AND TRUE) AS `t`,
30+
NOT (TRUE AND FALSE) AS `t`,
31+
NOT (FALSE AND TRUE) AS `t`,
32+
NOT (FALSE AND FALSE) AS `t`
33+
-- !query 1 schema
34+
struct<t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean>
35+
-- !query 1 output
36+
true true false true false true true true true
37+
38+
39+
-- !query 2
40+
SELECT
41+
-- boolean or transitions
42+
-- null because strict
43+
(NULL OR NULL) IS NULL AS `t`,
44+
(TRUE OR NULL) IS NULL AS `t`,
45+
(FALSE OR NULL) IS NULL AS `t`,
46+
(NULL OR TRUE) IS NULL AS `t`,
47+
(NULL OR FALSE) IS NULL AS `t`,
48+
-- actual computations
49+
(TRUE OR TRUE) AS `t`,
50+
(TRUE OR FALSE) AS `t`,
51+
(FALSE OR TRUE) AS `t`,
52+
NOT (FALSE OR FALSE) AS `t`
53+
-- !query 2 schema
54+
struct<t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean>
55+
-- !query 2 output
56+
true false true false true true true true true
57+
58+
59+
-- !query 3
60+
select min(udf(unique1)) from tenk1
61+
-- !query 3 schema
62+
struct<min(udf(unique1)):string>
63+
-- !query 3 output
64+
0
65+
66+
67+
-- !query 4
68+
select udf(max(unique1)) from tenk1
69+
-- !query 4 schema
70+
struct<udf(max(unique1)):string>
71+
-- !query 4 output
72+
9999
73+
74+
75+
-- !query 5
76+
select max(unique1) from tenk1 where udf(unique1) < 42
77+
-- !query 5 schema
78+
struct<max(unique1):int>
79+
-- !query 5 output
80+
41
81+
82+
83+
-- !query 6
84+
select max(unique1) from tenk1 where unique1 > udf(42)
85+
-- !query 6 schema
86+
struct<max(unique1):int>
87+
-- !query 6 output
88+
9999
89+
90+
91+
-- !query 7
92+
select max(unique1) from tenk1 where udf(unique1) > 42000
93+
-- !query 7 schema
94+
struct<max(unique1):int>
95+
-- !query 7 output
96+
NULL
97+
98+
99+
-- !query 8
100+
select max(tenthous) from tenk1 where udf(thousand) = 33
101+
-- !query 8 schema
102+
struct<max(tenthous):int>
103+
-- !query 8 output
104+
9033
105+
106+
107+
-- !query 9
108+
select min(tenthous) from tenk1 where udf(thousand) = 33
109+
-- !query 9 schema
110+
struct<min(tenthous):int>
111+
-- !query 9 output
112+
33
113+
114+
115+
-- !query 10
116+
select distinct max(udf(unique2)) from tenk1
117+
-- !query 10 schema
118+
struct<max(udf(unique2)):string>
119+
-- !query 10 output
120+
9999
121+
122+
123+
-- !query 11
124+
select max(unique2) from tenk1 order by udf(1)
125+
-- !query 11 schema
126+
struct<max(unique2):int>
127+
-- !query 11 output
128+
9999
129+
130+
131+
-- !query 12
132+
select max(unique2) from tenk1 order by max(udf(unique2))
133+
-- !query 12 schema
134+
struct<max(unique2):int>
135+
-- !query 12 output
136+
9999
137+
138+
139+
-- !query 13
140+
select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1
141+
-- !query 13 schema
142+
struct<udf(max(udf(unique2))):string>
143+
-- !query 13 output
144+
9999
145+
146+
147+
-- !query 14
148+
select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc
149+
-- !query 14 schema
150+
struct<max_unique2:string,udf(g):string>
151+
-- !query 14 output
152+
9999 3
153+
9999 2
154+
9999 1
155+
156+
157+
-- !query 15
158+
select udf(max(100)) from tenk1
159+
-- !query 15 schema
160+
struct<udf(max(100)):string>
161+
-- !query 15 output
162+
100

0 commit comments

Comments
 (0)