[SPARK-28285][SQL][PYTHON][TESTS] Convert and port 'outer-join.sql' into UDF test base

huaxingao · HyukjinKwon · commit 20578e81a734 · 2019-07-19T12:16:41.000+09:00
## What changes were proposed in this pull request? This PR adds some tests converted from ```outer-join.sql``` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'outer-join.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out index 5db3bae..819f786 100644 --- a/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out -24,17 +24,17 struct<> -- !query 2 SELECT - (SUM(COALESCE(t1.int_col1, t2.int_col0))), - ((COALESCE(t1.int_col1, t2.int_col0)) * 2) + (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))), + (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) FROM t1 RIGHT JOIN t2 - ON (t2.int_col0) = (t1.int_col1) -GROUP BY GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449)), + ON udf(t2.int_col0) = udf(t1.int_col1) +GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))), COALESCE(t1.int_col1, t2.int_col0) -HAVING (SUM(COALESCE(t1.int_col1, t2.int_col0))) - > ((COALESCE(t1.int_col1, t2.int_col0)) * 2) +HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0))))) + > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) -- !query 2 schema -struct<sum(coalesce(int_col1, int_col0)):bigint,(coalesce(int_col1, int_col0) * 2):int> +struct<CAST(udf(cast(sum(cast(cast(udf(cast(coalesce(int_col1, int_col0) as string)) as int) as bigint)) as string)) AS BIGINT):bigint,(CAST(udf(cast(coalesce(int_col1, int_col0) as string)) AS INT) * 2):int> -- !query 2 output -367 -734 -507 -1014 -70,10 +70,10 spark.sql.crossJoin.enabled true SELECT * FROM ( SELECT - COALESCE(t2.int_col1, t1.int_col1) AS int_col + udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col FROM t1 LEFT JOIN t2 ON false -) t where (t.int_col) is 
not null +) t where (udf(t.int_col)) is not null -- !query 6 schema struct<int_col:int> -- !query 6 output ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25103 from huaxingao/spark-28285. Authored-by: Huaxin Gao <huaxing@us.ibm.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql
@@ -0,0 +1,45 @@
+-- This test file was converted from outer-join.sql.
+-- List of configurations the test suite is run against:
+--SET spark.sql.autoBroadcastJoinThreshold=10485760
+--SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true
+--SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false
+
+-- SPARK-17099: Incorrect result when a HAVING clause is added to a GROUP BY query over an outer join. Here udf() is applied to the join keys, the first grouping expression, the SUM aggregates, and the HAVING operands.
+CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
+(-234), (145), (367), (975), (298)
+as t1(int_col1);
+
+CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES
+(-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158)
+as t2(int_col0, int_col1);
+
+SELECT
+  (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))),
+     (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2)
+FROM t1
+RIGHT JOIN t2
+  ON udf(t2.int_col0) = udf(t1.int_col1)
+GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))),
+         COALESCE(t1.int_col1, t2.int_col0)
+HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0)))))
+            > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2);
+
+
+-- SPARK-17120: Analyzer incorrectly optimizes plan to empty LocalRelation. The query below is expected to return the single row 97 (the t1 value), because the LEFT JOIN never matches and COALESCE falls back to t1.int_col1.
+CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1);
+
+CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1);
+
+-- Set the cross join enabled flag for the LEFT JOIN test: its ON clause is the constant false, so there is no real join condition.
+-- Ultimately the join should be optimized away. The flag is set back to false right after the query.
+set spark.sql.crossJoin.enabled = true;
+SELECT *
+FROM (
+SELECT
+    udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col
+    FROM t1
+    LEFT JOIN t2 ON false
+) t where (udf(t.int_col)) is not null;
+set spark.sql.crossJoin.enabled = false;
+
+
diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out
@@ -0,0 +1,88 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 8
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
+(-234), (145), (367), (975), (298)
+as t1(int_col1)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES
+(-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158)
+as t2(int_col0, int_col1)
+-- !query 1 schema
+struct<>
+-- !query 1 output
+
+
+
+-- !query 2
+SELECT
+  (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))),
+     (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2)
+FROM t1
+RIGHT JOIN t2
+  ON udf(t2.int_col0) = udf(t1.int_col1)
+GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))),
+         COALESCE(t1.int_col1, t2.int_col0)
+HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0)))))
+            > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2)
+-- !query 2 schema
+struct<CAST(udf(cast(sum(cast(cast(udf(cast(coalesce(int_col1, int_col0) as string)) as int) as bigint)) as string)) AS BIGINT):bigint,(CAST(udf(cast(coalesce(int_col1, int_col0) as string)) AS INT) * 2):int>
+-- !query 2 output
+-367	-734
+-507	-1014
+-769	-1538
+-800	-1600
+
+
+-- !query 3
+CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1)
+-- !query 3 schema
+struct<>
+-- !query 3 output
+
+
+
+-- !query 4
+CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1)
+-- !query 4 schema
+struct<>
+-- !query 4 output
+
+
+
+-- !query 5
+set spark.sql.crossJoin.enabled = true
+-- !query 5 schema
+struct<key:string,value:string>
+-- !query 5 output
+spark.sql.crossJoin.enabled	true
+
+
+-- !query 6
+SELECT *
+FROM (
+SELECT
+    udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col
+    FROM t1
+    LEFT JOIN t2 ON false
+) t where (udf(t.int_col)) is not null
+-- !query 6 schema
+struct<int_col:int>
+-- !query 6 output
+97
+
+
+-- !query 7
+set spark.sql.crossJoin.enabled = false
+-- !query 7 schema
+struct<key:string,value:string>
+-- !query 7 output
+spark.sql.crossJoin.enabled	false