Skip to content

Commit e4ea6cc

Browse files
treysp and georgesittas authored
Feat: transpile BQ APPROX_QUANTILES to DuckDB (#6349)
* fix bq approx_quantiles parsing
* add duckdb transpilation
* first arg should use _parse_disjunction
* num quantiles arg should use _parse_bitwise
* pass array of quantiles
* remove BQ parser, handle distinct in duck generator
* pr feedback
* Update sqlglot/dialects/duckdb.py

---------

Co-authored-by: Jo <[email protected]>
1 parent 50348ac commit e4ea6cc

File tree

2 files changed

+129
-1
lines changed

2 files changed

+129
-1
lines changed

sqlglot/dialects/duckdb.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from decimal import Decimal
34
from itertools import groupby
45
import re
56
import typing as t
@@ -1402,7 +1403,7 @@ def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
14021403
if isinstance(this, exp.First):
14031404
this = exp.AnyValue(this=this.this)
14041405

1405-
if not isinstance(this, exp.AnyValue):
1406+
if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
14061407
self.unsupported("IGNORE NULLS is not supported for non-window functions.")
14071408

14081409
return self.sql(this)
@@ -1594,3 +1595,40 @@ def round_sql(self, expression: exp.Round) -> str:
15941595
truncate = None
15951596

15961597
return self.func(func, this, decimals, truncate)
1598+
1599+
def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
    """
    Render BigQuery's APPROX_QUANTILES(expr, n) as DuckDB's APPROX_QUANTILE(expr, [q0, ..., qn]).

    BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
    dividing the input distribution into n equal-sized buckets. The equivalent DuckDB call
    receives the explicit list of quantile fractions 0, 1/n, 2/n, ..., 1.

    Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
    does not document the specific algorithm used so results may differ. DuckDB does not
    support RESPECT NULLS.

    The bucket count must be a positive integer literal because the fractions are computed
    at generation time. Otherwise the call is flagged through `self.unsupported` and the
    result of `self.function_fallback_sql(expression)` is returned instead.

    Args:
        expression: The APPROX_QUANTILES node. When DISTINCT is used, `expression.this` is
            an `exp.Distinct` node that holds both the value and the bucket count.

    Returns:
        The generated SQL string.
    """
    this = expression.this
    if isinstance(this, exp.Distinct):
        # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
        if len(this.expressions) < 2:
            self.unsupported("APPROX_QUANTILES requires a bucket count argument")
            return self.function_fallback_sql(expression)
        # Only look at the argument here; it is detached further down, once it is known
        # to be valid, so the fallback paths below still see every argument.
        num_quantiles_expr = this.expressions[1]
    else:
        num_quantiles_expr = expression.expression

    if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    num_quantiles = t.cast(int, num_quantiles_expr.to_py())
    if num_quantiles <= 0:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    if isinstance(this, exp.Distinct):
        # The bucket count is replaced by the quantile array, so it must not remain
        # inside the DISTINCT node that is reused as the first argument.
        num_quantiles_expr.pop()

    # Decimal division keeps the fractions exact (0.25 rather than a float repr).
    quantiles = [
        exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
        for i in range(num_quantiles + 1)
    ]

    return self.sql(
        exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
    )

tests/dialects/test_bigquery.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3451,3 +3451,93 @@ def test_round(self):
34513451
"duckdb": "SELECT ROUND_EVEN(CAST('2.25' AS DECIMAL), 1) AS value",
34523452
},
34533453
)
3454+
3455+
def test_approx_quantiles(self):
    """Check that each APPROX_QUANTILES spelling passes `validate_identity`."""
    identities = (
        "APPROX_QUANTILES(x, 2)",
        "APPROX_QUANTILES(FALSE OR TRUE, 2)",
        "APPROX_QUANTILES((SELECT 1 AS val), CAST(2.1 AS INT64))",
        "APPROX_QUANTILES(DISTINCT x, 2)",
        "APPROX_QUANTILES(x, 2 RESPECT NULLS)",
        "APPROX_QUANTILES(x, 2 IGNORE NULLS)",
        "APPROX_QUANTILES(DISTINCT x, 2 RESPECT NULLS)",
    )
    for sql in identities:
        self.validate_identity(sql)
3463+
3464+
def test_approx_quantiles_to_duckdb(self):
    """Check the DuckDB output for APPROX_QUANTILES, including the unsupported cases."""

    def assert_unsupported(sql):
        # Generating DuckDB SQL with unsupported_level=RAISE must raise UnsupportedError.
        with self.assertRaises(UnsupportedError):
            self.parse_one(sql).sql("duckdb", unsupported_level=ErrorLevel.RAISE)

    # Literal bucket counts expand into an explicit array of quantile fractions.
    expected_by_input = (
        ("APPROX_QUANTILES(x, 1)", "APPROX_QUANTILE(x, [0, 1])"),
        ("APPROX_QUANTILES(x, 2)", "APPROX_QUANTILE(x, [0, 0.5, 1])"),
        ("APPROX_QUANTILES(x, 4)", "APPROX_QUANTILE(x, [0, 0.25, 0.5, 0.75, 1])"),
        ("APPROX_QUANTILES(DISTINCT x, 2)", "APPROX_QUANTILE(DISTINCT x, [0, 0.5, 1])"),
    )
    for bigquery_sql, duckdb_sql in expected_by_input:
        self.validate_all(bigquery_sql, write={"duckdb": duckdb_sql})

    with self.subTest("APPROX_QUANTILES 100 buckets"):
        generated = self.parse_one("APPROX_QUANTILES(x, 100)").sql("duckdb")
        self.assertEqual(generated.count("APPROX_QUANTILE("), 1)
        for fraction in ("0.01", "0.99"):
            self.assertIn(fraction, generated)
        self.assertRegex(generated, r"APPROX_QUANTILE\(x, \[.*\]\)")

    # The first argument may be an arbitrary expression, not only a column.
    for operand in ("x + y", "CASE WHEN x > 0 THEN x ELSE 0 END", "ABS(x)"):
        with self.subTest(expr=operand):
            self.validate_all(
                f"APPROX_QUANTILES({operand}, 2)",
                write={"duckdb": f"APPROX_QUANTILE({operand}, [0, 0.5, 1])"},
            )

    with self.subTest("non-literal bucket count"):
        assert_unsupported("APPROX_QUANTILES(x, bucket_count)")

    with self.subTest("non-integer bucket count"):
        for value in ("0", "-1", "2.5"):
            with self.subTest(value=value):
                assert_unsupported(f"APPROX_QUANTILES(x, {value})")

    rejected_calls = (
        ("NULL bucket count", "APPROX_QUANTILES(x, NULL)"),
        ("missing bucket count", "APPROX_QUANTILES(x)"),
        ("missing bucket count with DISTINCT", "APPROX_QUANTILES(DISTINCT x)"),
    )
    for label, sql in rejected_calls:
        with self.subTest(label):
            assert_unsupported(sql)

    with self.subTest("APPROX_QUANTILES IGNORE NULLS"):
        # No warning: IGNORE NULLS is the default behavior in DuckDB
        from sqlglot.generator import logger as generator_logger

        with mock.patch.object(generator_logger, "warning") as warning_spy:
            self.validate_all(
                "APPROX_QUANTILES(x, 2 IGNORE NULLS)",
                write={"duckdb": "APPROX_QUANTILE(x, [0, 0.5, 1])"},
            )
            warning_spy.assert_not_called()

    with self.subTest("APPROX_QUANTILES RESPECT NULLS"):
        assert_unsupported("APPROX_QUANTILES(x, 2 RESPECT NULLS)")

0 commit comments

Comments
 (0)