Skip to content

Commit 6df0288

Browse files
feat(snowflake)!: Transpilation support for Snowflake's BITMAP_CONSTRUCT_AGG function to DuckDB (#6745)
* first attempt to transpile bitmap_construct_agg * made into template * optimization * enhanced explanation * expanded documentation, added range check * tweaked documentation * updated test
1 parent d58377e commit 6df0288

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

sqlglot/dialects/duckdb.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1814,6 +1814,67 @@ class Generator(generator.Generator):
18141814
"(ABS(HASH(:seed)) % 1000000) / 1000000.0"
18151815
)
18161816

1817+
# Template for BITMAP_CONSTRUCT_AGG transpilation
1818+
#
1819+
# BACKGROUND:
1820+
# Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap.
1821+
# Supports values in the range 0-32767; this version returns NULL if any value is out of range
1822+
# See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg
1823+
# See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts
1824+
#
1825+
# Snowflake uses two different formats based on the number of unique values:
1826+
#
1827+
# Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes
1828+
# Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003)
1829+
# Bytes 2-9: Up to 4 values, each as a 2-byte little-endian integer, zero-padded to 8 bytes
1830+
# Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex)
1831+
# count v1 v2 v3 pad
1832+
#
1833+
# Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes
1834+
# Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes
1835+
# Bytes 10+: Each value as 2-byte little-endian integer (no padding)
1836+
# Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500
1837+
# hdr ----9 zero bytes---- v1 v2 v3 v4 v5
1838+
#
1839+
# TEMPLATE STRUCTURE
1840+
#
1841+
# Phase 1 - Innermost subquery: Data preparation
1842+
# SELECT LIST_SORT(...) AS l
1843+
# - Aggregates all input values into a list, removes NULLs and duplicates, then sorts it
1844+
# Result: Clean, sorted list of unique non-null integers stored as 'l'
1845+
#
1846+
# Phase 2 - Middle subquery: Hex string construction
1847+
# LIST_TRANSFORM(...)
1848+
# - Converts each integer to 2-byte little-endian hex representation
1849+
# - & 255 extracts low byte, >> 8 extracts high byte
1850+
# - LIST_REDUCE: Concatenates all hex pairs into a single string 'h'
1851+
# Result: Hex string of all values
1852+
#
1853+
# Phase 3 - Outer SELECT: Final bitmap assembly
1854+
# LENGTH(l) < 5:
1855+
# - Small format: 2-byte count (big-endian via %04X) + values + zero padding
1856+
# LENGTH(l) >= 5:
1857+
# - Large format: Fixed 10-byte header + values (no padding needed)
1858+
# Result: Complete binary bitmap as BLOB
1859+
#
1860+
BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse(
1861+
"""
1862+
SELECT CASE
1863+
WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL
1864+
WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL
1865+
WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2))
1866+
ELSE UNHEX('08000000000000000000' || h)
1867+
END
1868+
FROM (
1869+
SELECT l, COALESCE(LIST_REDUCE(
1870+
LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)),
1871+
(__a, __b) -> __a || __b, ''
1872+
), '') AS h
1873+
FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l)
1874+
)
1875+
"""
1876+
)
1877+
18171878
# Template for RANDSTR transpilation - placeholders get replaced with actual parameters
18181879
RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse(
18191880
f"""
@@ -1882,6 +1943,20 @@ def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosit
18821943
)
18831944
)
18841945

1946+
def bitmapconstructagg_sql(
1947+
self: DuckDB.Generator, expression: exp.BitmapConstructAgg
1948+
) -> str:
1949+
"""
1950+
Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent.
1951+
Uses a pre-parsed template with placeholders replaced by expression nodes.
1952+
1953+
Snowflake bitmap format:
1954+
- Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
1955+
- Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
1956+
"""
1957+
arg = expression.this
1958+
return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
1959+
18851960
def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str:
18861961
"""
18871962
Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random.

tests/dialects/test_snowflake.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,13 @@ def test_snowflake(self):
8989
)
9090
self.validate_identity("SELECT BITMAP_BUCKET_NUMBER(32769)")
9191
self.validate_identity("SELECT BITMAP_CONSTRUCT_AGG(value)")
92+
self.validate_all(
93+
"SELECT BITMAP_CONSTRUCT_AGG(v) FROM t",
94+
write={
95+
"snowflake": "SELECT BITMAP_CONSTRUCT_AGG(v) FROM t",
96+
"duckdb": "SELECT (SELECT CASE WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL WHEN LENGTH(l) <> LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) ELSE UNHEX('08000000000000000000' || h) END FROM (SELECT l, COALESCE(LIST_REDUCE(LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), (__a, __b) -> __a || __b, ''), '') AS h FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(v) FILTER(WHERE NOT v IS NULL))) AS l))) FROM t",
97+
},
98+
)
9299
self.validate_identity(
93100
"SELECT BITMAP_COUNT(BITMAP_CONSTRUCT_AGG(value)) FROM TABLE(FLATTEN(INPUT => ARRAY_CONSTRUCT(1, 2, 3, 5)))",
94101
"SELECT BITMAP_COUNT(BITMAP_CONSTRUCT_AGG(value)) FROM TABLE(FLATTEN(INPUT => [1, 2, 3, 5]))",

0 commit comments

Comments
 (0)