[SPARK-28271][SQL][PYTHON][TESTS] Convert and port 'pgSQL/aggregates_part2.sql' into UDF test base

imback82 · HyukjinKwon · commit 8d686f34fc2e · 2019-07-11T09:45:38.000+09:00
## What changes were proposed in this pull request? This PR adds some tests converted from `pgSQL/aggregates_part2.sql'` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'pgSQL/aggregates_part2.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out index 2606d2e..00c06f9 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out -57,23 +57,23 true false true false true true true true true -- !query 3 -select min(unique1) from tenk1 +select min(udf(unique1)) from tenk1 -- !query 3 schema -struct<min(unique1):int> +struct<min(udf(unique1)):string> -- !query 3 output 0 -- !query 4 -select max(unique1) from tenk1 +select udf(max(unique1)) from tenk1 -- !query 4 schema -struct<max(unique1):int> +struct<udf(max(unique1)):string> -- !query 4 output 9999 -- !query 5 -select max(unique1) from tenk1 where unique1 < 42 +select max(unique1) from tenk1 where udf(unique1) < 42 -- !query 5 schema struct<max(unique1):int> -- !query 5 output -81,7 +81,7 struct<max(unique1):int> -- !query 6 -select max(unique1) from tenk1 where unique1 > 42 +select max(unique1) from tenk1 where unique1 > udf(42) -- !query 6 schema struct<max(unique1):int> -- !query 6 output -89,7 +89,7 struct<max(unique1):int> -- !query 7 -select max(unique1) from tenk1 where unique1 > 42000 +select max(unique1) from tenk1 where udf(unique1) > 42000 -- !query 7 schema struct<max(unique1):int> -- !query 7 output -97,7 +97,7 NULL -- !query 8 -select max(tenthous) from tenk1 where thousand = 33 +select max(tenthous) from tenk1 where udf(thousand) = 33 -- !query 8 schema struct<max(tenthous):int> -- !query 8 output -105,7 +105,7 struct<max(tenthous):int> -- !query 9 -select min(tenthous) from tenk1 where thousand = 33 +select min(tenthous) from tenk1 where udf(thousand) = 33 -- !query 9 schema struct<min(tenthous):int> -- !query 9 output -113,15 +113,15 struct<min(tenthous):int> -- !query 10 -select distinct max(unique2) from tenk1 +select distinct max(udf(unique2)) from tenk1 -- !query 10 schema -struct<max(unique2):int> +struct<max(udf(unique2)):string> -- !query 10 output 9999 -- !query 11 -select max(unique2) from tenk1 order by 1 +select max(unique2) from tenk1 order by udf(1) -- !query 11 schema struct<max(unique2):int> -- !query 11 output -129,7 +129,7 struct<max(unique2):int> -- !query 12 -select max(unique2) from tenk1 order by max(unique2) +select max(unique2) from tenk1 order by max(udf(unique2)) -- !query 12 schema struct<max(unique2):int> -- !query 12 output -137,7 +137,7 struct<max(unique2):int> -- !query 13 -select max(unique2) from tenk1 order by max(unique2)+1 +select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1 -- !query 13 schema -struct<max(unique2):int> +struct<udf(max(udf(unique2))):string> -- !query 13 output 9999 -- !query 14 -select t1.max_unique2, g from (select max(unique2) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc +select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc -- !query 14 schema -struct<max_unique2:int,g:int> +struct<max_unique2:string,udf(g):string> -- !query 14 output 9999 3 9999 2 -155,8 +155,8 struct<max_unique2:int,g:int> -- !query 15 -select max(100) from tenk1 +select udf(max(100)) from tenk1 -- !query 15 schema -struct<max(100):int> +struct<udf(max(100)):string> -- !query 15 output 100 ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25086 from imback82/udf_test. Authored-by: Terry Kim <yuminkim@gmail.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part2.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part2.sql
@@ -0,0 +1,232 @@
+--
+-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+--
+--
+-- AGGREGATES [Part 2]
+-- https://github.com/postgres/postgres/blob/REL_12_BETA1/src/test/regress/sql/aggregates.sql#L145-L350
+--
+-- This test file was converted from pgSQL/aggregates_part2.sql.
+-- Note that currently registered UDF returns a string. So there are some differences, for instance
+-- in string cast within UDF in Scala and Python.
+
+create temporary view int4_tbl as select * from values
+  (0),
+  (123456),
+  (-123456),
+  (2147483647),
+  (-2147483647)
+  as int4_tbl(f1);
+
+-- Test handling of Params within aggregate arguments in hashed aggregation.
+-- Per bug report from Jeevan Chalke.
+-- [SPARK-27877] Implement SQL-standard LATERAL subqueries
+-- explain (verbose, costs off)
+-- select s1, s2, sm
+-- from generate_series(1, 3) s1,
+--      lateral (select s2, sum(s1 + s2) sm
+--               from generate_series(1, 3) s2 group by s2) ss
+-- order by 1, 2;
+-- select s1, s2, sm
+-- from generate_series(1, 3) s1,
+--      lateral (select s2, sum(s1 + s2) sm
+--               from generate_series(1, 3) s2 group by s2) ss
+-- order by 1, 2;
+
+-- [SPARK-27878] Support ARRAY(sub-SELECT) expressions
+-- explain (verbose, costs off)
+-- select array(select sum(x+y) s
+--             from generate_series(1,3) y group by y order by s)
+--   from generate_series(1,3) x;
+-- select array(select sum(x+y) s
+--             from generate_series(1,3) y group by y order by s)
+--   from generate_series(1,3) x;
+
+-- [SPARK-27879] Implement bitwise integer aggregates(BIT_AND and BIT_OR)
+--
+-- test for bitwise integer aggregates
+--
+-- CREATE TEMPORARY TABLE bitwise_test(
+--   i2 INT2,
+--   i4 INT4,
+--   i8 INT8,
+--   i INTEGER,
+--   x INT2,
+--   y BIT(4)
+-- );
+
+-- empty case
+-- SELECT
+--   BIT_AND(i2) AS "?",
+--   BIT_OR(i4)  AS "?"
+-- FROM bitwise_test;
+
+-- COPY bitwise_test FROM STDIN NULL 'null';
+-- 1	1	1	1	1	B0101
+-- 3	3	3	null	2	B0100
+-- 7	7	7	3	4	B1100
+-- \.
+
+-- SELECT
+--   BIT_AND(i2) AS "1",
+--   BIT_AND(i4) AS "1",
+--   BIT_AND(i8) AS "1",
+--   BIT_AND(i)  AS "?",
+--   BIT_AND(x)  AS "0",
+--   BIT_AND(y)  AS "0100",
+--
+--   BIT_OR(i2)  AS "7",
+--   BIT_OR(i4)  AS "7",
+--   BIT_OR(i8)  AS "7",
+--   BIT_OR(i)   AS "?",
+--   BIT_OR(x)   AS "7",
+--   BIT_OR(y)   AS "1101"
+-- FROM bitwise_test;
+
+--
+-- test boolean aggregates
+--
+-- first test all possible transition and final states
+
+-- The result is inconsistent with PostgreSQL because our AND does not have strict mode
+SELECT
+  -- boolean and transitions
+  -- null because strict
+  (NULL AND NULL) IS NULL AS `t`,
+  (TRUE AND NULL) IS NULL AS `t`,
+  (FALSE AND NULL) IS NULL AS `t`,
+  (NULL AND TRUE) IS NULL AS `t`,
+  (NULL AND FALSE) IS NULL AS `t`,
+  -- and actual computations
+  (TRUE AND TRUE) AS `t`,
+  NOT (TRUE AND FALSE) AS `t`,
+  NOT (FALSE AND TRUE) AS `t`,
+  NOT (FALSE AND FALSE) AS `t`;
+
+-- The result is inconsistent with PostgreSQL because our OR does not have strict mode
+SELECT
+  -- boolean or transitions
+  -- null because strict
+  (NULL OR NULL) IS NULL AS `t`,
+  (TRUE OR NULL) IS NULL AS `t`,
+  (FALSE OR NULL) IS NULL AS `t`,
+  (NULL OR TRUE) IS NULL AS `t`,
+  (NULL OR FALSE) IS NULL AS `t`,
+  -- actual computations
+  (TRUE OR TRUE) AS `t`,
+  (TRUE OR FALSE) AS `t`,
+  (FALSE OR TRUE) AS `t`,
+  NOT (FALSE OR FALSE) AS `t`;
+
+-- [SPARK-27880] Implement boolean aggregates(BOOL_AND, BOOL_OR and EVERY)
+-- CREATE TEMPORARY TABLE bool_test(
+--   b1 BOOL,
+--   b2 BOOL,
+--   b3 BOOL,
+--   b4 BOOL);
+
+-- empty case
+-- SELECT
+--   BOOL_AND(b1)   AS "n",
+--   BOOL_OR(b3)    AS "n"
+-- FROM bool_test;
+
+-- COPY bool_test FROM STDIN NULL 'null';
+-- TRUE	null	FALSE	null
+-- FALSE	TRUE	null	null
+-- null	TRUE	FALSE	null
+-- \.
+
+-- SELECT
+--   BOOL_AND(b1)     AS "f",
+--   BOOL_AND(b2)     AS "t",
+--   BOOL_AND(b3)     AS "f",
+--   BOOL_AND(b4)     AS "n",
+--   BOOL_AND(NOT b2) AS "f",
+--   BOOL_AND(NOT b3) AS "t"
+-- FROM bool_test;
+
+-- SELECT
+--   EVERY(b1)     AS "f",
+--   EVERY(b2)     AS "t",
+--   EVERY(b3)     AS "f",
+--   EVERY(b4)     AS "n",
+--   EVERY(NOT b2) AS "f",
+--   EVERY(NOT b3) AS "t"
+-- FROM bool_test;
+
+-- SELECT
+--   BOOL_OR(b1)      AS "t",
+--   BOOL_OR(b2)      AS "t",
+--   BOOL_OR(b3)      AS "f",
+--   BOOL_OR(b4)      AS "n",
+--   BOOL_OR(NOT b2)  AS "f",
+--   BOOL_OR(NOT b3)  AS "t"
+-- FROM bool_test;
+
+--
+-- Test cases that should be optimized into indexscans instead of
+-- the generic aggregate implementation.
+--
+
+-- Basic cases
+-- explain
+--  select min(unique1) from tenk1;
+select min(udf(unique1)) from tenk1;
+-- explain
+--  select max(unique1) from tenk1;
+select udf(max(unique1)) from tenk1;
+-- explain
+--  select max(unique1) from tenk1 where unique1 < 42;
+select max(unique1) from tenk1 where udf(unique1) < 42;
+-- explain
+--  select max(unique1) from tenk1 where unique1 > 42;
+select max(unique1) from tenk1 where unique1 > udf(42);
+
+-- the planner may choose a generic aggregate here if parallel query is
+-- enabled, since that plan will be parallel safe and the "optimized"
+-- plan, which has almost identical cost, will not be.  we want to test
+-- the optimized plan, so temporarily disable parallel query.
+-- begin;
+-- set local max_parallel_workers_per_gather = 0;
+-- explain
+--  select max(unique1) from tenk1 where unique1 > 42000;
+select max(unique1) from tenk1 where udf(unique1) > 42000;
+-- rollback;
+
+-- multi-column index (uses tenk1_thous_tenthous)
+-- explain
+--  select max(tenthous) from tenk1 where thousand = 33;
+select max(tenthous) from tenk1 where udf(thousand) = 33;
+-- explain
+--  select min(tenthous) from tenk1 where thousand = 33;
+select min(tenthous) from tenk1 where udf(thousand) = 33;
+
+-- [SPARK-17348] Correlated column is not allowed in a non-equality predicate
+-- check parameter propagation into an indexscan subquery
+-- explain
+--  select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt
+--    from int4_tbl;
+-- select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt
+--  from int4_tbl;
+
+-- check some cases that were handled incorrectly in 8.3.0
+-- explain
+--  select distinct max(unique2) from tenk1;
+select distinct max(udf(unique2)) from tenk1;
+-- explain
+--  select max(unique2) from tenk1 order by 1;
+select max(unique2) from tenk1 order by udf(1);
+-- explain
+--  select max(unique2) from tenk1 order by max(unique2);
+select max(unique2) from tenk1 order by max(udf(unique2));
+-- explain
+--  select max(unique2) from tenk1 order by max(unique2)+1;
+select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1;
+-- explain
+--  select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
+select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc;
+
+-- interesting corner case: constant gets optimized into a seqscan
+-- explain
+--  select max(100) from tenk1;
+select udf(max(100)) from tenk1;
diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out
@@ -0,0 +1,162 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 16
+
+
+-- !query 0
+create temporary view int4_tbl as select * from values
+  (0),
+  (123456),
+  (-123456),
+  (2147483647),
+  (-2147483647)
+  as int4_tbl(f1)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+SELECT
+  -- boolean and transitions
+  -- null because strict
+  (NULL AND NULL) IS NULL AS `t`,
+  (TRUE AND NULL) IS NULL AS `t`,
+  (FALSE AND NULL) IS NULL AS `t`,
+  (NULL AND TRUE) IS NULL AS `t`,
+  (NULL AND FALSE) IS NULL AS `t`,
+  -- and actual computations
+  (TRUE AND TRUE) AS `t`,
+  NOT (TRUE AND FALSE) AS `t`,
+  NOT (FALSE AND TRUE) AS `t`,
+  NOT (FALSE AND FALSE) AS `t`
+-- !query 1 schema
+struct<t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean>
+-- !query 1 output
+true	true	false	true	false	true	true	true	true
+
+
+-- !query 2
+SELECT
+  -- boolean or transitions
+  -- null because strict
+  (NULL OR NULL) IS NULL AS `t`,
+  (TRUE OR NULL) IS NULL AS `t`,
+  (FALSE OR NULL) IS NULL AS `t`,
+  (NULL OR TRUE) IS NULL AS `t`,
+  (NULL OR FALSE) IS NULL AS `t`,
+  -- actual computations
+  (TRUE OR TRUE) AS `t`,
+  (TRUE OR FALSE) AS `t`,
+  (FALSE OR TRUE) AS `t`,
+  NOT (FALSE OR FALSE) AS `t`
+-- !query 2 schema
+struct<t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean,t:boolean>
+-- !query 2 output
+true	false	true	false	true	true	true	true	true
+
+
+-- !query 3
+select min(udf(unique1)) from tenk1
+-- !query 3 schema
+struct<min(udf(unique1)):string>
+-- !query 3 output
+0
+
+
+-- !query 4
+select udf(max(unique1)) from tenk1
+-- !query 4 schema
+struct<udf(max(unique1)):string>
+-- !query 4 output
+9999
+
+
+-- !query 5
+select max(unique1) from tenk1 where udf(unique1) < 42
+-- !query 5 schema
+struct<max(unique1):int>
+-- !query 5 output
+41
+
+
+-- !query 6
+select max(unique1) from tenk1 where unique1 > udf(42)
+-- !query 6 schema
+struct<max(unique1):int>
+-- !query 6 output
+9999
+
+
+-- !query 7
+select max(unique1) from tenk1 where udf(unique1) > 42000
+-- !query 7 schema
+struct<max(unique1):int>
+-- !query 7 output
+NULL
+
+
+-- !query 8
+select max(tenthous) from tenk1 where udf(thousand) = 33
+-- !query 8 schema
+struct<max(tenthous):int>
+-- !query 8 output
+9033
+
+
+-- !query 9
+select min(tenthous) from tenk1 where udf(thousand) = 33
+-- !query 9 schema
+struct<min(tenthous):int>
+-- !query 9 output
+33
+
+
+-- !query 10
+select distinct max(udf(unique2)) from tenk1
+-- !query 10 schema
+struct<max(udf(unique2)):string>
+-- !query 10 output
+9999
+
+
+-- !query 11
+select max(unique2) from tenk1 order by udf(1)
+-- !query 11 schema
+struct<max(unique2):int>
+-- !query 11 output
+9999
+
+
+-- !query 12
+select max(unique2) from tenk1 order by max(udf(unique2))
+-- !query 12 schema
+struct<max(unique2):int>
+-- !query 12 output
+9999
+
+
+-- !query 13
+select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1
+-- !query 13 schema
+struct<udf(max(udf(unique2))):string>
+-- !query 13 output
+9999
+
+
+-- !query 14
+select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc
+-- !query 14 schema
+struct<max_unique2:string,udf(g):string>
+-- !query 14 output
+9999	3
+9999	2
+9999	1
+
+
+-- !query 15
+select udf(max(100)) from tenk1
+-- !query 15 schema
+struct<udf(max(100)):string>
+-- !query 15 output
+100