diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py
index fe93eab251..44a6cd881a 100644
--- a/sqlglot/dialects/duckdb.py
+++ b/sqlglot/dialects/duckdb.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from decimal import Decimal
 from itertools import groupby
 import re
 import typing as t
@@ -1402,7 +1403,7 @@ def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
         if isinstance(this, exp.First):
             this = exp.AnyValue(this=this.this)
 
-        if not isinstance(this, exp.AnyValue):
+        if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
             self.unsupported("IGNORE NULLS is not supported for non-window functions.")
 
         return self.sql(this)
@@ -1594,3 +1595,51 @@ def round_sql(self, expression: exp.Round) -> str:
                 truncate = None
 
         return self.func(func, this, decimals, truncate)
+
+    def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
+        """
+        BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
+        dividing the input distribution into n equal-sized buckets.
+
+        Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
+        does not document the specific algorithm used so results may differ. DuckDB does not
+        support RESPECT NULLS.
+
+        The bucket count must be a positive integer literal, because it is expanded at
+        generation time into the explicit quantile array [0, 1/n, ..., 1]. Any other bucket
+        count is reported via `unsupported` and handed to `function_fallback_sql` untouched.
+        """
+        this = expression.this
+        distinct = this if isinstance(this, exp.Distinct) else None
+
+        if distinct is not None:
+            # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
+            if len(distinct.expressions) < 2:
+                self.unsupported("APPROX_QUANTILES requires a bucket count argument")
+                return self.function_fallback_sql(expression)
+            num_quantiles_expr = distinct.expressions[1]
+        else:
+            num_quantiles_expr = expression.expression
+
+        if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
+            self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
+            return self.function_fallback_sql(expression)
+
+        num_quantiles = t.cast(int, num_quantiles_expr.to_py())
+        if num_quantiles <= 0:
+            self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
+            return self.function_fallback_sql(expression)
+
+        if distinct is not None:
+            # Detach the bucket count only after it has been validated, so that the fallback
+            # paths above still receive the call with both of its arguments.
+            num_quantiles_expr.pop()
+
+        quantiles = [
+            exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
+            for i in range(num_quantiles + 1)
+        ]
+
+        return self.sql(
+            exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
+        )
diff --git a/tests/dialects/test_bigquery.py b/tests/dialects/test_bigquery.py
index 08229ddb02..e9629cb132 100644
--- a/tests/dialects/test_bigquery.py
+++ b/tests/dialects/test_bigquery.py
@@ -3451,3 +3451,99 @@ def test_round(self):
                 "duckdb": "SELECT ROUND_EVEN(CAST('2.25' AS DECIMAL), 1) AS value",
             },
         )
+
+    def test_approx_quantiles(self):
+        self.validate_identity("APPROX_QUANTILES(x, 2)")
+        self.validate_identity("APPROX_QUANTILES(FALSE OR TRUE, 2)")
+        self.validate_identity("APPROX_QUANTILES((SELECT 1 AS val), CAST(2.1 AS INT64))")
+        self.validate_identity("APPROX_QUANTILES(DISTINCT x, 2)")
+        self.validate_identity("APPROX_QUANTILES(x, 2 RESPECT NULLS)")
+        self.validate_identity("APPROX_QUANTILES(x, 2 IGNORE NULLS)")
+        self.validate_identity("APPROX_QUANTILES(DISTINCT x, 2 RESPECT NULLS)")
+
+    def test_approx_quantiles_to_duckdb(self):
+        self.validate_all(
+            "APPROX_QUANTILES(x, 1)",
+            write={"duckdb": "APPROX_QUANTILE(x, [0, 1])"},
+        )
+        self.validate_all(
+            "APPROX_QUANTILES(x, 2)",
+            write={"duckdb": "APPROX_QUANTILE(x, [0, 0.5, 1])"},
+        )
+        self.validate_all(
+            "APPROX_QUANTILES(x, 4)",
+            write={"duckdb": "APPROX_QUANTILE(x, [0, 0.25, 0.5, 0.75, 1])"},
+        )
+        self.validate_all(
+            "APPROX_QUANTILES(DISTINCT x, 2)",
+            write={"duckdb": "APPROX_QUANTILE(DISTINCT x, [0, 0.5, 1])"},
+        )
+
+        with self.subTest("APPROX_QUANTILES 100 buckets"):
+            result = self.parse_one("APPROX_QUANTILES(x, 100)").sql("duckdb")
+            self.assertEqual(result.count("APPROX_QUANTILE("), 1)
+            self.assertIn("0.01", result)
+            self.assertIn("0.99", result)
+            self.assertRegex(result, r"APPROX_QUANTILE\(x, \[.*\]\)")
+
+        for expr in ("x + y", "CASE WHEN x > 0 THEN x ELSE 0 END", "ABS(x)"):
+            with self.subTest(expr=expr):
+                self.validate_all(
+                    f"APPROX_QUANTILES({expr}, 2)",
+                    write={"duckdb": f"APPROX_QUANTILE({expr}, [0, 0.5, 1])"},
+                )
+
+        with self.subTest("non-literal bucket count"):
+            with self.assertRaises(UnsupportedError):
+                self.parse_one("APPROX_QUANTILES(x, bucket_count)").sql(
+                    "duckdb", unsupported_level=ErrorLevel.RAISE
+                )
+
+        with self.subTest("invalid bucket count with DISTINCT keeps both arguments"):
+            result = self.parse_one("APPROX_QUANTILES(DISTINCT x, bucket_count)").sql(
+                "duckdb", unsupported_level=ErrorLevel.IGNORE
+            )
+            self.assertIn("bucket_count", result)
+
+        with self.subTest("non-integer bucket count"):
+            for value in ("0", "-1", "2.5"):
+                with self.subTest(value=value):
+                    with self.assertRaises(UnsupportedError):
+                        self.parse_one(f"APPROX_QUANTILES(x, {value})").sql(
+                            "duckdb", unsupported_level=ErrorLevel.RAISE
+                        )
+
+        with self.subTest("NULL bucket count"):
+            with self.assertRaises(UnsupportedError):
+                self.parse_one("APPROX_QUANTILES(x, NULL)").sql(
+                    "duckdb", unsupported_level=ErrorLevel.RAISE
+                )
+
+        with self.subTest("missing bucket count"):
+            with self.assertRaises(UnsupportedError):
+                self.parse_one("APPROX_QUANTILES(x)").sql(
+                    "duckdb", unsupported_level=ErrorLevel.RAISE
+                )
+
+        with self.subTest("missing bucket count with DISTINCT"):
+            with self.assertRaises(UnsupportedError):
+                self.parse_one("APPROX_QUANTILES(DISTINCT x)").sql(
+                    "duckdb", unsupported_level=ErrorLevel.RAISE
+                )
+
+        with self.subTest("APPROX_QUANTILES IGNORE NULLS"):
+            # No warning: IGNORE NULLS is the default behavior in DuckDB
+            from sqlglot.generator import logger as generator_logger
+
+            with mock.patch.object(generator_logger, "warning") as mock_warning:
+                self.validate_all(
+                    "APPROX_QUANTILES(x, 2 IGNORE NULLS)",
+                    write={"duckdb": "APPROX_QUANTILE(x, [0, 0.5, 1])"},
+                )
+                mock_warning.assert_not_called()
+
+        with self.subTest("APPROX_QUANTILES RESPECT NULLS"):
+            with self.assertRaises(UnsupportedError):
+                self.parse_one("APPROX_QUANTILES(x, 2 RESPECT NULLS)").sql(
+                    "duckdb", unsupported_level=ErrorLevel.RAISE
+                )