Skip to content

Commit 20578e8

Browse files
huaxingaoHyukjinKwon
authored andcommitted
[SPARK-28285][SQL][PYTHON][TESTS] Convert and port 'outer-join.sql' into UDF test base
## What changes were proposed in this pull request? This PR adds some tests converted from ```outer-join.sql``` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). <details><summary>Diff comparing to 'outer-join.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out index 5db3bae..819f786 100644 --- a/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out -24,17 +24,17 struct<> -- !query 2 SELECT - (SUM(COALESCE(t1.int_col1, t2.int_col0))), - ((COALESCE(t1.int_col1, t2.int_col0)) * 2) + (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))), + (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) FROM t1 RIGHT JOIN t2 - ON (t2.int_col0) = (t1.int_col1) -GROUP BY GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449)), + ON udf(t2.int_col0) = udf(t1.int_col1) +GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))), COALESCE(t1.int_col1, t2.int_col0) -HAVING (SUM(COALESCE(t1.int_col1, t2.int_col0))) - > ((COALESCE(t1.int_col1, t2.int_col0)) * 2) +HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0))))) + > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) -- !query 2 schema -struct<sum(coalesce(int_col1, int_col0)):bigint,(coalesce(int_col1, int_col0) * 2):int> +struct<CAST(udf(cast(sum(cast(cast(udf(cast(coalesce(int_col1, int_col0) as string)) as int) as bigint)) as string)) AS BIGINT):bigint,(CAST(udf(cast(coalesce(int_col1, int_col0) as string)) AS INT) * 2):int> -- !query 2 output -367 -734 -507 -1014 -70,10 +70,10 spark.sql.crossJoin.enabled true SELECT * FROM ( SELECT - COALESCE(t2.int_col1, t1.int_col1) AS int_col + udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col FROM t1 LEFT JOIN t2 ON false -) t where (t.int_col) is 
not null +) t where (udf(t.int_col)) is not null -- !query 6 schema struct<int_col:int> -- !query 6 output ``` </p> </details> ## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25103 from huaxingao/spark-28285. Authored-by: Huaxin Gao <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent d2598fe commit 20578e8

File tree

2 files changed

+133
-0
lines changed

2 files changed

+133
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
-- This test file was converted from outer-join.sql.
2+
-- List of configurations the test suite is run against:
3+
--SET spark.sql.autoBroadcastJoinThreshold=10485760
4+
--SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true
5+
--SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false
6+
7+
-- SPARK-17099: Incorrect result when HAVING clause is added to group by query
8+
-- Fixture: temporary view t1 with a single column (int_col1) and five integer rows.
-- It is the left-hand input of the RIGHT JOIN query that follows.
CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
9+
(-234), (145), (367), (975), (298)
10+
as t1(int_col1);
11+
12+
-- Fixture: temporary view t2 with two columns (int_col0, int_col1) and five rows.
-- None of the int_col0 values equals any t1.int_col1 value (note 367 in t1 vs. -367 here),
-- so an equality join between t2.int_col0 and t1.int_col1 finds no matching pairs.
CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES
13+
(-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158)
14+
as t2(int_col0, int_col1);
15+
16+
-- SPARK-17099 regression query: RIGHT JOIN + GROUP BY + HAVING, with udf() wrapped around
-- the join keys, the aggregate, and parts of the grouping and HAVING expressions.
-- NOTE(review): udf is not defined in this file; presumably it is registered by the test
-- harness. Confirm against the UDF test base.
-- With the fixture rows, the join condition matches nothing, so every output row comes from
-- t2 with t1.int_col1 NULL, and COALESCE(t1.int_col1, t2.int_col0) yields t2.int_col0.
-- Each int_col0 is distinct, so each row is its own group and the HAVING predicate
-- (sum > value * 2) keeps only the groups whose value is negative.
-- Do not reformat the statement: the expected-output file echoes the query text and derives
-- its schema column names from these exact expressions.
SELECT
17+
(udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))),
18+
(udf(COALESCE(t1.int_col1, t2.int_col0)) * 2)
19+
FROM t1
20+
RIGHT JOIN t2
21+
ON udf(t2.int_col0) = udf(t1.int_col1)
22+
GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))),
23+
COALESCE(t1.int_col1, t2.int_col0)
24+
HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0)))))
25+
> (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2);
26+
27+
28+
-- SPARK-17120: Analyzer incorrectly optimizes plan to empty LocalRelation
29+
-- Replaces the earlier t1 fixture with a single row (97) in column int_col1.
CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1);
30+
31+
-- Replaces the earlier t2 fixture: it now has only one column (int_col1) and a single row (0).
CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1);
32+
33+
-- Set the cross join enabled flag for the LEFT JOIN test since there's no join condition.
34+
-- Ultimately the join should be optimized away.
35+
-- Enable cross joins for the duration of the query below, whose join condition is the
-- constant false rather than a column comparison.
-- Because ON false never matches, the LEFT JOIN keeps the single t1 row with NULL t2 columns;
-- COALESCE then falls through to t1.int_col1, so the IS NOT NULL filter must keep that row.
-- Expected result is one row: 97. An empty result would indicate the SPARK-17120 bug.
set spark.sql.crossJoin.enabled = true;
36+
SELECT *
37+
FROM (
38+
SELECT
39+
udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col
40+
FROM t1
41+
LEFT JOIN t2 ON false
42+
) t where (udf(t.int_col)) is not null;
43+
-- Turn the cross join flag back off once the query has run.
set spark.sql.crossJoin.enabled = false;
44+
45+
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
-- Automatically generated by SQLQueryTestSuite
2+
-- Number of queries: 8
3+
4+
5+
-- !query 0
6+
CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES
7+
(-234), (145), (367), (975), (298)
8+
as t1(int_col1)
9+
-- !query 0 schema
10+
struct<>
11+
-- !query 0 output
12+
13+
14+
15+
-- !query 1
16+
CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES
17+
(-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158)
18+
as t2(int_col0, int_col1)
19+
-- !query 1 schema
20+
struct<>
21+
-- !query 1 output
22+
23+
24+
25+
-- !query 2
26+
SELECT
27+
(udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))),
28+
(udf(COALESCE(t1.int_col1, t2.int_col0)) * 2)
29+
FROM t1
30+
RIGHT JOIN t2
31+
ON udf(t2.int_col0) = udf(t1.int_col1)
32+
GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf(-449)))),
33+
COALESCE(t1.int_col1, t2.int_col0)
34+
HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0)))))
35+
> (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2)
36+
-- !query 2 schema
37+
struct<CAST(udf(cast(sum(cast(cast(udf(cast(coalesce(int_col1, int_col0) as string)) as int) as bigint)) as string)) AS BIGINT):bigint,(CAST(udf(cast(coalesce(int_col1, int_col0) as string)) AS INT) * 2):int>
38+
-- !query 2 output
39+
-367 -734
40+
-507 -1014
41+
-769 -1538
42+
-800 -1600
43+
44+
45+
-- !query 3
46+
CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1)
47+
-- !query 3 schema
48+
struct<>
49+
-- !query 3 output
50+
51+
52+
53+
-- !query 4
54+
CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1)
55+
-- !query 4 schema
56+
struct<>
57+
-- !query 4 output
58+
59+
60+
61+
-- !query 5
62+
set spark.sql.crossJoin.enabled = true
63+
-- !query 5 schema
64+
struct<key:string,value:string>
65+
-- !query 5 output
66+
spark.sql.crossJoin.enabled true
67+
68+
69+
-- !query 6
70+
SELECT *
71+
FROM (
72+
SELECT
73+
udf(COALESCE(udf(t2.int_col1), udf(t1.int_col1))) AS int_col
74+
FROM t1
75+
LEFT JOIN t2 ON false
76+
) t where (udf(t.int_col)) is not null
77+
-- !query 6 schema
78+
struct<int_col:int>
79+
-- !query 6 output
80+
97
81+
82+
83+
-- !query 7
84+
set spark.sql.crossJoin.enabled = false
85+
-- !query 7 schema
86+
struct<key:string,value:string>
87+
-- !query 7 output
88+
spark.sql.crossJoin.enabled false

0 commit comments

Comments
 (0)