Skip to content

Commit 92e051c

Browse files
committed
[SPARK-28270][SQL][PYTHON] Convert and port 'pgSQL/aggregates_part1.sql' into UDF test base
## What changes were proposed in this pull request? This PR adds some tests converted from `pgSQL/aggregates_part1.sql'` to test UDFs. Please see contribution guide of this umbrella ticket - [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). This PR also contains two minor fixes: 1. Change name of Scala UDF from `UDF:name(...)` to `name(...)` to be consistent with Python' 2. Fix Scala UDF at `IntegratedUDFTestUtils.scala ` to handle `null` in strings. <details><summary>Diff comparing to 'pgSQL/aggregates_part1.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out index 51ca1d5..124fdd6416e 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out -3,7 +3,7 -- !query 0 -SELECT avg(four) AS avg_1 FROM onek +SELECT avg(udf(four)) AS avg_1 FROM onek -- !query 0 schema struct<avg_1:double> -- !query 0 output -11,15 +11,15 struct<avg_1:double> -- !query 1 -SELECT avg(a) AS avg_32 FROM aggtest WHERE a < 100 +SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100 -- !query 1 schema -struct<avg_32:double> +struct<avg_32:string> -- !query 1 output 32.666666666666664 -- !query 2 -select CAST(avg(b) AS Decimal(10,3)) AS avg_107_943 FROM aggtest +select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest -- !query 2 schema struct<avg_107_943:decimal(10,3)> -- !query 2 output -27,285 +27,286 struct<avg_107_943:decimal(10,3)> -- !query 3 -SELECT sum(four) AS sum_1500 FROM onek +SELECT sum(udf(four)) AS sum_1500 FROM onek -- !query 3 schema -struct<sum_1500:bigint> +struct<sum_1500:double> -- !query 3 output -1500 +1500.0 -- !query 4 -SELECT sum(a) AS sum_198 FROM aggtest +SELECT udf(sum(a)) AS sum_198 FROM aggtest -- !query 4 schema -struct<sum_198:bigint> +struct<sum_198:string> -- 
!query 4 output 198 -- !query 5 -SELECT sum(b) AS avg_431_773 FROM aggtest +SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest -- !query 5 schema -struct<avg_431_773:double> +struct<avg_431_773:string> -- !query 5 output 431.77260909229517 -- !query 6 -SELECT max(four) AS max_3 FROM onek +SELECT udf(max(four)) AS max_3 FROM onek -- !query 6 schema -struct<max_3:int> +struct<max_3:string> -- !query 6 output 3 -- !query 7 -SELECT max(a) AS max_100 FROM aggtest +SELECT max(udf(a)) AS max_100 FROM aggtest -- !query 7 schema -struct<max_100:int> +struct<max_100:string> -- !query 7 output -100 +56 -- !query 8 -SELECT max(aggtest.b) AS max_324_78 FROM aggtest +SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest -- !query 8 schema -struct<max_324_78:float> +struct<max_324_78:int> -- !query 8 output -324.78 +324 -- !query 9 -SELECT stddev_pop(b) FROM aggtest +SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest -- !query 9 schema -struct<stddev_pop(CAST(b AS DOUBLE)):double> +struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS INT):int> -- !query 9 output -131.10703231895047 +131 -- !query 10 -SELECT stddev_samp(b) FROM aggtest +SELECT udf(stddev_samp(b)) FROM aggtest -- !query 10 schema -struct<stddev_samp(CAST(b AS DOUBLE)):double> +struct<udf(stddev_samp(cast(b as double))):string> -- !query 10 output 151.38936080399804 -- !query 11 -SELECT var_pop(b) FROM aggtest +SELECT CAST(var_pop(udf(b)) as int) FROM aggtest -- !query 11 schema -struct<var_pop(CAST(b AS DOUBLE)):double> +struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS INT):int> -- !query 11 output -17189.053923482323 +17189 -- !query 12 -SELECT var_samp(b) FROM aggtest +SELECT udf(var_samp(b)) FROM aggtest -- !query 12 schema -struct<var_samp(CAST(b AS DOUBLE)):double> +struct<udf(var_samp(cast(b as double))):string> -- !query 12 output 22918.738564643096 -- !query 13 -SELECT stddev_pop(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest -- !query 13 
schema -struct<stddev_pop(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))):string> -- !query 13 output 131.18117242958306 -- !query 14 -SELECT stddev_samp(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest -- !query 14 schema -struct<stddev_samp(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)):double> -- !query 14 output 151.47497042966097 -- !query 15 -SELECT var_pop(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest -- !query 15 schema -struct<var_pop(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<udf(var_pop(cast(cast(b as decimal(38,0)) as double))):string> -- !query 15 output 17208.5 -- !query 16 -SELECT var_samp(CAST(b AS Decimal(38,0))) FROM aggtest +SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest -- !query 16 schema -struct<var_samp(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)):double> -- !query 16 output 22944.666666666668 -- !query 17 -SELECT var_pop(1.0), var_samp(2.0) +SELECT udf(var_pop(1.0)), var_samp(udf(2.0)) -- !query 17 schema -struct<var_pop(CAST(1.0 AS DOUBLE)):double,var_samp(CAST(2.0 AS DOUBLE)):double> +struct<udf(var_pop(cast(1.0 as double))):string,var_samp(CAST(udf(2.0) AS DOUBLE)):double> -- !query 17 output 0.0 NaN -- !query 18 -SELECT stddev_pop(CAST(3.0 AS Decimal(38,0))), stddev_samp(CAST(4.0 AS Decimal(38,0))) +SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0))) -- !query 18 schema -struct<stddev_pop(CAST(CAST(3.0 AS DECIMAL(38,0)) AS DOUBLE)):double,stddev_samp(CAST(CAST(4.0 AS DECIMAL(38,0)) AS DOUBLE)):double> +struct<stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)):double,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double> -- !query 18 
output 0.0 NaN -- !query 19 -select sum(CAST(null AS int)) from range(1,4) +select sum(udf(CAST(null AS int))) from range(1,4) -- !query 19 schema -struct<sum(CAST(NULL AS INT)):bigint> +struct<sum(CAST(udf(cast(null as int)) AS DOUBLE)):double> -- !query 19 output NULL -- !query 20 -select sum(CAST(null AS long)) from range(1,4) +select sum(udf(CAST(null AS long))) from range(1,4) -- !query 20 schema -struct<sum(CAST(NULL AS BIGINT)):bigint> +struct<sum(CAST(udf(cast(null as bigint)) AS DOUBLE)):double> -- !query 20 output NULL -- !query 21 -select sum(CAST(null AS Decimal(38,0))) from range(1,4) +select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query 21 schema -struct<sum(CAST(NULL AS DECIMAL(38,0))):decimal(38,0)> +struct<sum(CAST(udf(cast(null as decimal(38,0))) AS DOUBLE)):double> -- !query 21 output NULL -- !query 22 -select sum(CAST(null AS DOUBLE)) from range(1,4) +select sum(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query 22 schema -struct<sum(CAST(NULL AS DOUBLE)):double> +struct<sum(CAST(udf(cast(null as double)) AS DOUBLE)):double> -- !query 22 output NULL -- !query 23 -select avg(CAST(null AS int)) from range(1,4) +select avg(udf(CAST(null AS int))) from range(1,4) -- !query 23 schema -struct<avg(CAST(NULL AS INT)):double> +struct<avg(CAST(udf(cast(null as int)) AS DOUBLE)):double> -- !query 23 output NULL -- !query 24 -select avg(CAST(null AS long)) from range(1,4) +select avg(udf(CAST(null AS long))) from range(1,4) -- !query 24 schema -struct<avg(CAST(NULL AS BIGINT)):double> +struct<avg(CAST(udf(cast(null as bigint)) AS DOUBLE)):double> -- !query 24 output NULL -- !query 25 -select avg(CAST(null AS Decimal(38,0))) from range(1,4) +select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4) -- !query 25 schema -struct<avg(CAST(NULL AS DECIMAL(38,0))):decimal(38,4)> +struct<avg(CAST(udf(cast(null as decimal(38,0))) AS DOUBLE)):double> -- !query 25 output NULL -- !query 26 -select avg(CAST(null AS DOUBLE)) from range(1,4) +select 
avg(udf(CAST(null AS DOUBLE))) from range(1,4) -- !query 26 schema -struct<avg(CAST(NULL AS DOUBLE)):double> +struct<avg(CAST(udf(cast(null as double)) AS DOUBLE)):double> -- !query 26 output NULL -- !query 27 -select sum(CAST('NaN' AS DOUBLE)) from range(1,4) +select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query 27 schema -struct<sum(CAST(NaN AS DOUBLE)):double> +struct<sum(CAST(udf(NaN) AS DOUBLE)):double> -- !query 27 output NaN -- !query 28 -select avg(CAST('NaN' AS DOUBLE)) from range(1,4) +select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4) -- !query 28 schema -struct<avg(CAST(NaN AS DOUBLE)):double> +struct<avg(CAST(udf(NaN) AS DOUBLE)):double> -- !query 28 output NaN -- !query 29 SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) -FROM (VALUES (CAST('1' AS DOUBLE)), (CAST('Infinity' AS DOUBLE))) v(x) +FROM (VALUES (CAST(udf('1') AS DOUBLE)), (CAST(udf('Infinity') AS DOUBLE))) v(x) -- !query 29 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<> -- !query 29 output -Infinity NaN +org.apache.spark.sql.AnalysisException +cannot evaluate expression CAST(udf(1) AS DOUBLE) in inline table definition; line 2 pos 14 -- !query 30 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('1')) v(x) -- !query 30 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double> -- !query 30 output Infinity NaN -- !query 31 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('Infinity')) v(x) -- !query 31 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double> -- !query 31 output Infinity NaN -- !query 32 -SELECT 
avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('-Infinity'), ('Infinity')) v(x) -- !query 32 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double> -- !query 32 output NaN NaN -- !query 33 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x) -- !query 33 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string> -- !query 33 output 1.00000005E8 2.5 -- !query 34 -SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) +SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) FROM (VALUES (7000000000005), (7000000000007)) v(x) -- !query 34 schema -struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double> +struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string> -- !query 34 output 7.000000000006E12 1.0 -- !query 35 -SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest +SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest -- !query 35 schema -struct<covar_pop(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double,covar_samp(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double> +struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS INT):int,CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS INT):int> -- !query 35 output -653.6289553875104 871.5052738500139 +653 871 -- !query 36 -SELECT corr(b, a) FROM aggtest +SELECT corr(b, udf(a)) FROM aggtest -- !query 36 schema -struct<corr(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double> +struct<corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)):double> -- !query 
36 output 0.1396345165178734 -- !query 37 -SELECT count(four) AS cnt_1000 FROM onek +SELECT count(udf(four)) AS cnt_1000 FROM onek -- !query 37 schema struct<cnt_1000:bigint> -- !query 37 output -313,36 +314,36 struct<cnt_1000:bigint> -- !query 38 -SELECT count(DISTINCT four) AS cnt_4 FROM onek +SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek -- !query 38 schema -struct<cnt_4:bigint> +struct<cnt_4:string> -- !query 38 output 4 -- !query 39 -select ten, count(*), sum(four) from onek +select ten, udf(count(*)), sum(udf(four)) from onek group by ten order by ten -- !query 39 schema -struct<ten:int,count(1):bigint,sum(four):bigint> +struct<ten:int,udf(count(1)):string,sum(CAST(udf(four) AS DOUBLE)):double> -- !query 39 output -0 100 100 -1 100 200 -2 100 100 -3 100 200 -4 100 100 -5 100 200 -6 100 100 -7 100 200 -8 100 100 -9 100 200 +0 100 100.0 +1 100 200.0 +2 100 100.0 +3 100 200.0 +4 100 100.0 +5 100 200.0 +6 100 100.0 +7 100 200.0 +8 100 100.0 +9 100 200.0 -- !query 40 -select ten, count(four), sum(DISTINCT four) from onek +select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek group by ten order by ten -- !query 40 schema -struct<ten:int,count(four):bigint,sum(DISTINCT four):bigint> +struct<ten:int,count(udf(four)):bigint,udf(sum(distinct cast(four as bigint))):string> -- !query 40 output 0 100 2 1 100 4 -357,11 +358,11 struct<ten:int,count(four):bigint,sum(DISTINCT four):bigint> -- !query 41 -select ten, sum(distinct four) from onek a +select ten, udf(sum(distinct four)) from onek a group by ten -having exists (select 1 from onek b where sum(distinct a.four) = b.four) +having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four) -- !query 41 schema -struct<ten:int,sum(DISTINCT four):bigint> +struct<ten:int,udf(sum(distinct cast(four as bigint))):string> -- !query 41 output 0 2 2 2 -374,23 +375,23 struct<ten:int,sum(DISTINCT four):bigint> select ten, sum(distinct four) from onek a group by ten having exists (select 1 from onek b - 
where sum(distinct a.four + b.four) = b.four) + where sum(distinct a.four + b.four) = udf(b.four)) -- !query 42 schema struct<> -- !query 42 output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. -Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(b.`four` AS BIGINT))] +Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(udf(four) AS BIGINT))] Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; -- !query 43 select - (select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1))) + (select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))) from tenk1 o -- !query 43 schema struct<> -- !query 43 output org.apache.spark.sql.AnalysisException -cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 63 +cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 67 ``` </p> </details> Note that, currently, `IntegratedUDFTestUtils.scala`'s UDFs only return strings. There are some differences between those UDFs (Scala, Pandas and Python): - Python's string representation of floats can make the tests flaky. (See https://docs.python.org/3/tutorial/floatingpoint.html). To work around this, I had to `CAST(... as int)`. - There are string representation differences between `Inf` `-Inf` <> `Infinity` `-Infinity` and `nan` <> `NaN` - Maybe we should add other type versions of UDFs if this makes adding tests difficult. Note that one issue found - [SPARK-28291](https://issues.apache.org/jira/browse/SPARK-28291). The test was commented for now. 
## How was this patch tested? Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921). Closes apache#25069 from HyukjinKwon/SPARK-28270. Authored-by: HyukjinKwon <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent 6532153 commit 92e051c

File tree

3 files changed

+544
-1
lines changed

3 files changed

+544
-1
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
--
2+
-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
3+
--
4+
--
5+
-- AGGREGATES [Part 1]
6+
-- https://github.com/postgres/postgres/blob/REL_12_BETA1/src/test/regress/sql/aggregates.sql#L1-L143
7+
8+
-- avoid bit-exact output here because operations may not be bit-exact.
9+
-- SET extra_float_digits = 0;
10+
11+
-- This test file was converted from pgSQL/aggregates_part1.sql.
12+
-- Note that the currently registered UDF returns a string, so there are some differences, for instance
13+
-- in string cast within UDF in Scala and Python.
14+
15+
-- avg() with udf placed at different levels: around the aggregate input, around the
-- aggregate result, and around the input with the result cast to a fixed-scale decimal.
-- udf() returns a string, so udf(avg(a)) is expected to produce a string column.
SELECT avg(udf(four)) AS avg_1 FROM onek;
16+
17+
SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100;
18+
19+
-- In 7.1, avg(float4) is computed using float8 arithmetic.
20+
-- Round the result to 3 digits to avoid platform-specific results.
21+
22+
select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest;
23+
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
24+
-- SELECT avg(gpa) AS avg_3_4 FROM ONLY student;
25+
26+
-- sum() with udf around the aggregate input (first query) and around the aggregate
-- result (once, then twice, in the next two queries).
-- udf() returns a string, so sum(udf(four)) is computed as a double: the expected
-- result is 1500.0 rather than the bigint 1500 of the original test.
SELECT sum(udf(four)) AS sum_1500 FROM onek;
27+
SELECT udf(sum(a)) AS sum_198 FROM aggtest;
28+
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest;
29+
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
30+
-- SELECT sum(gpa) AS avg_6_8 FROM ONLY student;
31+
32+
-- max() with udf around the aggregate result.
SELECT udf(max(four)) AS max_3 FROM onek;
33+
-- NOTE: udf() returns a string, so max() here is a string comparison. The expected
-- result is 56, not the 100 that the alias (kept from the original test) suggests.
SELECT max(udf(a)) AS max_100 FROM aggtest;
34+
-- The result is cast to int so that the expected output does not depend on how each
-- UDF implementation (Scala, Python, Pandas) renders floats as strings.
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest;
35+
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
36+
-- SELECT max(student.gpa) AS max_3_7 FROM student;
37+
38+
-- Standard deviation and variance over b.
-- Where udf wraps the aggregate input, the result is cast to int so that the expected
-- output does not depend on how a UDF implementation renders floats as strings.
-- Where udf wraps the aggregate result, the column is expected to be a string.
SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest;
39+
SELECT udf(stddev_samp(b)) FROM aggtest;
40+
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest;
41+
SELECT udf(var_samp(b)) FROM aggtest;
42+
43+
-- The same four aggregates over b cast to Decimal(38,0). udf sits at a different
-- nesting level in each query: around the aggregate, inside the cast, or around the cast.
SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
44+
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest;
45+
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
46+
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest;
47+
48+
-- population variance is defined for a single tuple, sample variance
49+
-- is not
50+
-- Expected output for both queries: 0.0 for the population form, NaN for the sample form.
SELECT udf(var_pop(1.0)), var_samp(udf(2.0));
51+
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));
52+
53+
54+
-- verify correct results for null and NaN inputs
55+
-- NULL cases: udf wraps a typed NULL before it reaches sum()/avg(); expected result is NULL.
-- Because udf() returns a string, the aggregate input is cast to DOUBLE regardless of
-- the original type, so every result column is expected to be a double.
select sum(udf(CAST(null AS int))) from range(1,4);
56+
select sum(udf(CAST(null AS long))) from range(1,4);
57+
select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4);
58+
select sum(udf(CAST(null AS DOUBLE))) from range(1,4);
59+
select avg(udf(CAST(null AS int))) from range(1,4);
60+
select avg(udf(CAST(null AS long))) from range(1,4);
61+
select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4);
62+
select avg(udf(CAST(null AS DOUBLE))) from range(1,4);
63+
-- NaN cases: udf wraps the string literal and the cast to DOUBLE is applied to the
-- udf result; expected result is NaN.
select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4);
64+
select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4);
65+
66+
-- [SPARK-27768] verify correct results for infinite inputs
67+
-- [SPARK-28291] UDFs cannot be evaluated within inline table definition
68+
-- SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
69+
-- FROM (VALUES (CAST(udf('1') AS DOUBLE)), (CAST(udf('Infinity') AS DOUBLE))) v(x);
70+
-- udf is applied to the string column x of the inline table; the cast to DOUBLE is
-- applied to the udf result (udf inside VALUES is not possible, see SPARK-28291).
-- Expected output: (Infinity, NaN), (Infinity, NaN) and (NaN, NaN) respectively.
SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE))
71+
FROM (VALUES ('Infinity'), ('1')) v(x);
72+
SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE))
73+
FROM (VALUES ('Infinity'), ('Infinity')) v(x);
74+
SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE))
75+
FROM (VALUES ('-Infinity'), ('Infinity')) v(x);
76+
77+
78+
-- test accuracy with a large input offset
79+
-- avg() takes the udf-wrapped double input; var_pop() is wrapped by udf, so its
-- column is expected to be a string.
-- Expected output: (1.00000005E8, 2.5) and (7.000000000006E12, 1.0).
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
80+
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
81+
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
82+
FROM (VALUES (7000000000005), (7000000000007)) v(x);
83+
84+
-- SQL2003 binary aggregates [SPARK-23907]
85+
-- SELECT regr_count(b, a) FROM aggtest;
86+
-- SELECT regr_sxx(b, a) FROM aggtest;
87+
-- SELECT regr_syy(b, a) FROM aggtest;
88+
-- SELECT regr_sxy(b, a) FROM aggtest;
89+
-- SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
90+
-- SELECT regr_r2(b, a) FROM aggtest;
91+
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
92+
-- Covariance results are cast to int so that the expected output does not depend on
-- how a UDF implementation renders floats as strings (expected: 653 and 871).
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest;
93+
-- corr() with udf applied to the second argument only.
SELECT corr(b, udf(a)) FROM aggtest;
94+
95+
96+
-- test accum and combine functions directly [SPARK-23907]
97+
-- CREATE TABLE regr_test (x float8, y float8);
98+
-- INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200);
99+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
100+
-- FROM regr_test WHERE x IN (10,20,30,80);
101+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
102+
-- FROM regr_test;
103+
-- SELECT float8_accum('{4,140,2900}'::float8[], 100);
104+
-- SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100);
105+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
106+
-- FROM regr_test WHERE x IN (10,20,30);
107+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
108+
-- FROM regr_test WHERE x IN (80,100);
109+
-- SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]);
110+
-- SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]);
111+
-- SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]);
112+
-- SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
113+
-- '{0,0,0,0,0,0}'::float8[]);
114+
-- SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[],
115+
-- '{2,180,200,740,57800,-3400}'::float8[]);
116+
-- SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
117+
-- '{2,180,200,740,57800,-3400}'::float8[]);
118+
-- DROP TABLE regr_test;
119+
120+
121+
-- test count, distinct
122+
-- count() over a udf-wrapped column keeps a bigint result; wrapping the aggregate
-- itself in udf is expected to produce a string column.
SELECT count(udf(four)) AS cnt_1000 FROM onek;
123+
SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek;
124+
125+
-- Grouped variant: udf around count(*) and around the input of sum().
-- sum(udf(four)) is computed as a double (e.g. 100.0), since udf() returns a string.
select ten, udf(count(*)), sum(udf(four)) from onek
126+
group by ten order by ten;
127+
128+
-- Grouped variant: udf around the input of count() and around sum(DISTINCT ...).
select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek
129+
group by ten order by ten;
130+
131+
-- user-defined aggregates
132+
-- SELECT newavg(four) AS avg_1 FROM onek;
133+
-- SELECT newsum(four) AS sum_1500 FROM onek;
134+
-- SELECT newcnt(four) AS cnt_1000 FROM onek;
135+
-- SELECT newcnt(*) AS cnt_1000 FROM onek;
136+
-- SELECT oldcnt(*) AS cnt_1000 FROM onek;
137+
-- SELECT sum2(q1,q2) FROM int8_tbl;
138+
139+
-- test for outer-level aggregates
140+
141+
-- this should work
142+
-- udf wraps the outer-level aggregate both in the select list and inside the
-- correlated EXISTS subquery of the HAVING clause.
select ten, udf(sum(distinct four)) from onek a
143+
group by ten
144+
having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four);
145+
146+
-- this should fail because subquery has an agg of its own in WHERE
147+
-- udf only wraps b.four on the right-hand side of the comparison. The expected result
-- is still an AnalysisException: aggregate expressions are not valid in a WHERE clause.
select ten, sum(distinct four) from onek a
148+
group by ten
149+
having exists (select 1 from onek b
150+
where sum(distinct a.four + b.four) = udf(b.four));
151+
152+
-- [SPARK-27769] Test handling of sublinks within outer-level aggregates.
153+
-- Per bug report from Daniel Grace.
154+
-- udf wraps max() over a correlated scalar subquery. The expected result is currently
-- an AnalysisException ("cannot resolve '`o.unique1`'"), as tracked by SPARK-27769.
select
155+
(select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1))))
156+
from tenk1 o;

0 commit comments

Comments
 (0)