Skip to content

Commit c8acca3

Browse files
fivetran-kwoodbeck authored and georgesittas committed
feat(snowflake)!: support Snowflake to DuckDB transpilation of ZIPF (#6618)
* ZIPF transpilation for Snowflake to DuckDB * updated tests * changed s and n to force constants * improved parameter checking * tweaked to fix random issue * templated the ZIPF transpilation * templated the ZIPF transpilation
1 parent 1f42813 commit c8acca3

File tree

2 files changed

+77
-2
lines changed

2 files changed

+77
-2
lines changed

sqlglot/dialects/duckdb.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,6 +1646,67 @@ def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str:
16461646
)
16471647
return f"({self.sql(query)})"
16481648

1649+
# Template for ZIPF transpilation - placeholders get replaced with actual parameters.
#
# Placeholders used in the query below:
#   :random_expr - expression selected as `r` in the `rand` CTE
#   :s           - exponent passed to POWER(i, :s) when computing each weight
#   :n           - element count; the RANGE upper bound is written as `:n + 1`
#
# Shape of the query:
#   rand    - single row holding the draw `r`
#   weights - one row per i with weight w = 1.0 / POWER(i, :s)
#   cdf     - running sum of w ordered by i, divided by the total sum of w
#   result  - MIN(i) over the rows whose cumulative share p is >= r
#
# The template is parsed once at class-definition time; zipf_sql copies it
# before substituting the placeholders.
ZIPF_TEMPLATE: t.ClassVar[exp.Expression] = exp.maybe_parse(
    """
    WITH rand AS (SELECT :random_expr AS r),
    weights AS (
        SELECT i, 1.0 / POWER(i, :s) AS w
        FROM RANGE(1, :n + 1) AS t(i)
    ),
    cdf AS (
        SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p
        FROM weights
    )
    SELECT MIN(i)
    FROM cdf
    WHERE p >= (SELECT r FROM rand)
    """
)
1666+
1667+
def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str:
    """
    Render Snowflake's ZIPF(s, n, gen) as a parenthesized DuckDB subquery.

    The subquery is produced from a copy of ``ZIPF_TEMPLATE`` (CDF-based
    inverse sampling) by swapping its ``:s``, ``:n`` and ``:random_expr``
    placeholders for copies of the corresponding expression nodes.

    Args:
        expression: The Zipf node. ``this`` supplies ``s``, the
            ``elementcount`` arg supplies ``n`` and the ``gen`` arg supplies
            the generator / seed.

    Returns:
        The generated SQL for the filled-in template, wrapped in parentheses.

    Raises:
        AssertionError: If ``s`` or ``n`` is missing from the node.
    """
    exponent = expression.this
    element_count = expression.args.get("elementcount")

    # s, n are required args per Zipf.arg_types
    assert exponent is not None and element_count is not None

    generator = expression.args.get("gen")

    uniform_draw: exp.Expression
    if generator and not isinstance(generator, exp.Rand):
        # Seed-like generator: (ABS(HASH(seed)) % 1000000) / 1000000.0
        hashed_seed = exp.Abs(
            this=exp.Anonymous(this="HASH", expressions=[generator.copy()])
        )
        bucket = exp.Mod(this=hashed_seed, expression=exp.Literal.number(1000000))
        uniform_draw = exp.Div(
            this=exp.Paren(this=bucket),
            expression=exp.Literal.number(1000000.0),
        )
    else:
        # A Rand generator, or no generator at all: use RANDOM() for
        # non-deterministic output.
        uniform_draw = exp.Rand()

    bindings: dict[str, exp.Expression] = {
        "s": exponent,
        "n": element_count,
        "random_expr": uniform_draw,
    }

    def fill_placeholder(node: exp.Expression) -> exp.Expression:
        # Only placeholders whose name is bound get replaced; every other
        # node is returned untouched.
        if not isinstance(node, exp.Placeholder):
            return node
        bound = bindings.get(node.name)
        if bound is None:
            return node
        return bound.copy()

    filled_query = self.ZIPF_TEMPLATE.copy().transform(fill_placeholder)
    return f"({self.sql(filled_query)})"
1709+
16491710
def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str:
16501711
"""
16511712
TO_BINARY(value, format) transpilation if the return type is BINARY:

tests/dialects/test_snowflake.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,22 @@ def test_snowflake(self):
279279
},
280280
)
281281

282-
self.validate_identity("SELECT ZIPF(1, 10, RANDOM())")
283-
self.validate_identity("SELECT ZIPF(2, 100, 1234)")
282+
self.validate_all(
283+
"SELECT ZIPF(1, 10, 1234)",
284+
write={
285+
"duckdb": "SELECT (WITH rand AS (SELECT (ABS(HASH(1234)) % 1000000) / 1000000.0 AS r), weights AS (SELECT i, 1.0 / POWER(i, 1) AS w FROM RANGE(1, 10 + 1) AS t(i)), cdf AS (SELECT i, SUM(w) OVER (ORDER BY i NULLS FIRST) / SUM(w) OVER () AS p FROM weights) SELECT MIN(i) FROM cdf WHERE p >= (SELECT r FROM rand))",
286+
"snowflake": "SELECT ZIPF(1, 10, 1234)",
287+
},
288+
)
289+
290+
self.validate_all(
291+
"SELECT ZIPF(2, 100, RANDOM())",
292+
write={
293+
"duckdb": "SELECT (WITH rand AS (SELECT RANDOM() AS r), weights AS (SELECT i, 1.0 / POWER(i, 2) AS w FROM RANGE(1, 100 + 1) AS t(i)), cdf AS (SELECT i, SUM(w) OVER (ORDER BY i NULLS FIRST) / SUM(w) OVER () AS p FROM weights) SELECT MIN(i) FROM cdf WHERE p >= (SELECT r FROM rand))",
294+
"snowflake": "SELECT ZIPF(2, 100, RANDOM())",
295+
},
296+
)
297+
284298
self.validate_identity("SELECT GROUPING_ID(a, b) AS g_id FROM x GROUP BY ROLLUP (a, b)")
285299
self.validate_identity("PARSE_URL('https://example.com/path')")
286300
self.validate_identity("PARSE_URL('https://example.com/path', 1)")

0 commit comments

Comments
 (0)