-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Expand file tree
/
Copy pathduckdb.py
More file actions
2791 lines (2295 loc) · 113 KB
/
duckdb.py
File metadata and controls
2791 lines (2295 loc) · 113 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from __future__ import annotations
from decimal import Decimal
from itertools import groupby
import re
import typing as t
from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot.dialects.dialect import (
DATETIME_DELTA,
Dialect,
JSON_EXTRACT_TYPE,
NormalizationStrategy,
approx_count_distinct_sql,
arrow_json_extract_sql,
binary_from_function,
bool_xor_sql,
build_default_decimal_type,
build_formatted_time,
build_regexp_extract,
count_if_to_sum,
date_delta_to_binary_interval_op,
date_trunc_to_time,
datestrtodate_sql,
encode_decode_sql,
explode_to_unnest_sql,
getbit_sql,
groupconcat_sql,
inline_array_unless_query,
months_between_sql,
no_datetime_sql,
no_comment_column_constraint_sql,
no_make_interval_sql,
no_time_sql,
no_timestamp_sql,
pivot_column_names,
regexp_replace_global_modifier,
rename_func,
remove_from_array_using_filter,
sha2_digest_sql,
sha256_sql,
strposition_sql,
str_to_time_sql,
timestrtotime_sql,
unit_to_str,
)
from sqlglot.generator import unsupported_args
from sqlglot.helper import is_date_unit, seq_get
from sqlglot.tokens import TokenType
from sqlglot.parser import binary_range_parser
# Regex to detect time zones in timestamps of the form [+|-]TT[:tt]
# The pattern matches timezone offsets that appear after the time portion
TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?")

# Characters that must be escaped when building regex expressions in INITCAP
REGEX_ESCAPE_REPLACEMENTS = {
    "\\": "\\\\",
    "-": r"\-",
    "^": r"\^",
    "[": r"\[",
    "]": r"\]",
}

# Used in RANDSTR transpilation: the candidate character pool and a fixed seed
RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
RANDSTR_SEED = 123456

# Whitespace control characters that DuckDB must process with `CHR({val})` calls
WS_CONTROL_CHARS_TO_DUCK = {
    "\u000b": 11,
    "\u001c": 28,
    "\u001d": 29,
    "\u001e": 30,
    "\u001f": 31,
}

# Days of week to ISO 8601 day-of-week numbers
# ISO 8601 standard: Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6, Sunday=7
WEEK_START_DAY_TO_DOW = {
    "MONDAY": 1,
    "TUESDAY": 2,
    "WEDNESDAY": 3,
    "THURSDAY": 4,
    "FRIDAY": 5,
    "SATURDAY": 6,
    "SUNDAY": 7,
}

# NOTE(review): presumably the maximum bit position (2**15) accepted by the
# bit-manipulation transpilations — confirm against its call sites
MAX_BIT_POSITION = exp.Literal.number(32768)
def _last_day_sql(self: DuckDB.Generator, expression: exp.LastDay) -> str:
    """
    DuckDB's LAST_DAY only supports finding the last day of a month.
    For other date parts (year, quarter, week), we need to implement equivalent logic.

    Args:
        expression: the LastDay node; `this` is the date operand and the optional
            `unit` arg names the date part (MONTH/YEAR/QUARTER/WEEK).
    Returns:
        DuckDB SQL producing the last day of the requested period; falls back to the
        generic function rendering (with a warning) for unsupported units.
    """
    date_expr = expression.this
    unit = expression.text("unit")
    if not unit or unit.upper() == "MONTH":
        # Default behavior - use DuckDB's native LAST_DAY
        return self.func("LAST_DAY", date_expr)
    if unit.upper() == "YEAR":
        # Last day of year: December 31st of the same year
        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
        make_date_expr = exp.func(
            "MAKE_DATE", year_expr, exp.Literal.number(12), exp.Literal.number(31)
        )
        return self.sql(make_date_expr)
    if unit.upper() == "QUARTER":
        # Last day of quarter
        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
        quarter_expr = exp.func("EXTRACT", "QUARTER", date_expr)
        # Calculate last month of quarter: quarter * 3. Quarter can be 1 to 4
        last_month_expr = exp.Mul(this=quarter_expr, expression=exp.Literal.number(3))
        first_day_last_month_expr = exp.func(
            "MAKE_DATE", year_expr, last_month_expr, exp.Literal.number(1)
        )
        # Last day of the last month of the quarter
        last_day_expr = exp.func("LAST_DAY", first_day_last_month_expr)
        return self.sql(last_day_expr)
    if unit.upper() == "WEEK":
        # DuckDB DAYOFWEEK: Sunday=0, Monday=1, ..., Saturday=6
        dow = exp.func("EXTRACT", "DAYOFWEEK", date_expr)
        # Days to the last day of week: (7 - dayofweek) % 7, assuming the last day of week is Sunday (Snowflake)
        # Wrap in parentheses to ensure correct precedence
        days_to_sunday_expr = exp.Mod(
            this=exp.Paren(this=exp.Sub(this=exp.Literal.number(7), expression=dow)),
            expression=exp.Literal.number(7),
        )
        interval_expr = exp.Interval(this=days_to_sunday_expr, unit=exp.var("DAY"))
        add_expr = exp.Add(this=date_expr, expression=interval_expr)
        cast_expr = exp.cast(add_expr, exp.DataType.Type.DATE)
        return self.sql(cast_expr)
    self.unsupported(f"Unsupported date part '{unit}' in LAST_DAY function")
    return self.function_fallback_sql(expression)
def _next_day_sql(self: DuckDB.Generator, expression: exp.NextDay) -> str:
    """
    Transpile Snowflake's NEXT_DAY to DuckDB using date arithmetic.
    Returns the DATE of the next occurrence of the specified weekday.
    Formula: (target_dow - current_dow + 6) % 7 + 1
    The +6 normalizes negative differences and the final +1 prevents zero results.

    Args:
        expression: NextDay node; `this` is the input date and `expression` must be a
            string literal day name or a prefix abbreviation of at least 2 characters
            (e.g. 'FRI', 'Friday').
    Returns:
        DuckDB SQL of the form CAST(date + INTERVAL (...) DAY AS DATE), or the generic
        fallback rendering (with a warning) for unsupported inputs.

    Examples:
        NEXT_DAY('2024-01-01' (Monday), 'Monday')
        → (1 - 1 + 6) % 7 + 1 = 6 % 7 + 1 = 7 days → 2024-01-08
        NEXT_DAY('2024-01-01' (Monday), 'Friday')
        → (5 - 1 + 6) % 7 + 1 = 10 % 7 + 1 = 4 days → 2024-01-05
    """
    date_expr = expression.this
    day_name_expr = expression.expression
    # Handle NULL inputs - return CAST(NULL AS DATE)
    if isinstance(date_expr, exp.Null) or isinstance(day_name_expr, exp.Null):
        return self.sql(exp.cast(exp.Null(), exp.DataType.Type.DATE))
    # Only support literal day names (not columns/expressions)
    if not isinstance(day_name_expr, exp.Literal):
        self.unsupported("NEXT_DAY with non-literal day name not supported in DuckDB")
        return self.function_fallback_sql(expression)
    # Extract and normalize day name
    day_name_str = str(day_name_expr.this).upper()
    if len(day_name_str) < 2:
        self.unsupported("Day name must be at least 2 characters")
        return self.function_fallback_sql(expression)
    # Find matching day in WEEK_START_DAY_TO_DOW (handles both full names and abbreviations)
    # e.g., "MONDAY" matches "MONDAY", "MO" matches "MONDAY", "FRI" matches "FRIDAY"
    matching_day = next(
        (day for day in WEEK_START_DAY_TO_DOW if day.startswith(day_name_str)), None
    )
    if not matching_day:
        self.unsupported(f"Invalid day name or abbreviation: {day_name_str}")
        return self.function_fallback_sql(expression)
    target_dow = WEEK_START_DAY_TO_DOW[matching_day]
    # Build the calculation: (target - ISODOW(date) + 6) % 7 + 1
    isodow_call = exp.func("ISODOW", date_expr)
    # Step 1: target - ISODOW(date) + 6
    days_expr = exp.Add(
        this=exp.Sub(this=exp.Literal.number(target_dow), expression=isodow_call),
        expression=exp.Literal.number(6),
    )
    # Step 2: (...) % 7
    mod_expr = exp.Mod(this=exp.Paren(this=days_expr), expression=exp.Literal.number(7))
    # Step 3: ... + 1
    days_to_add = exp.Add(this=mod_expr, expression=exp.Literal.number(1))
    # Build final: CAST(date + INTERVAL (days_to_add) DAY AS DATE)
    result = exp.cast(
        exp.Add(
            this=date_expr,
            expression=exp.Interval(this=days_to_add, unit=exp.var("DAY")),
        ),
        exp.DataType.Type.DATE,
    )
    return self.sql(result)
def _is_nanosecond_unit(unit: t.Optional[exp.Expression]) -> bool:
    """Return True when `unit` is a Var/Literal naming the NANOSECOND date part."""
    if not isinstance(unit, (exp.Var, exp.Literal)):
        return False
    return unit.name.upper() == "NANOSECOND"
def _handle_nanosecond_diff(
    self: DuckDB.Generator,
    end_time: exp.Expression,
    start_time: exp.Expression,
) -> str:
    """Emit EPOCH_NS(end) - EPOCH_NS(start), since DuckDB's DATE_DIFF has no NANOSECOND unit."""
    end_epoch, start_epoch = (
        exp.func("EPOCH_NS", exp.cast(ts, exp.DataType.Type.TIMESTAMP_NS))
        for ts in (end_time, start_time)
    )
    return self.sql(exp.Sub(this=end_epoch, expression=start_epoch))
def _to_boolean_sql(self: DuckDB.Generator, expression: exp.ToBoolean) -> str:
    """
    Transpile TO_BOOLEAN and TRY_TO_BOOLEAN functions from Snowflake to DuckDB equivalent.
    DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off'.
    We need to handle the 'on'/'off' cases explicitly.
    For TO_BOOLEAN (safe=False): NaN and INF values cause errors. We use DuckDB's native ERROR()
    function to replicate this behavior with a clear error message.
    For TRY_TO_BOOLEAN (safe=True): Use DuckDB's TRY_CAST for conversion, which returns NULL
    for invalid inputs instead of throwing errors.
    """
    arg = expression.this
    # `safe` distinguishes TRY_TO_BOOLEAN (True) from TO_BOOLEAN (False)
    is_safe = expression.args.get("safe", False)
    # Shared CASE arms handling 'on'/'off', which plain CAST does not accept
    base_case_expr = (
        exp.case()
        .when(
            # Handle 'on' -> TRUE (case insensitive)
            exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("ON")),
            exp.true(),
        )
        .when(
            # Handle 'off' -> FALSE (case insensitive)
            exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("OFF")),
            exp.false(),
        )
    )
    if is_safe:
        # TRY_TO_BOOLEAN: handle 'on'/'off' and use TRY_CAST for everything else
        case_expr = base_case_expr.else_(exp.func("TRY_CAST", arg, exp.DataType.build("BOOLEAN")))
    else:
        # TO_BOOLEAN: handle NaN/INF errors, 'on'/'off', and use regular CAST
        cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))
        # Check for NaN and INF values
        nan_inf_check = exp.Or(
            this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
        )
        case_expr = base_case_expr.when(
            nan_inf_check,
            exp.func(
                "ERROR",
                exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
            ),
        ).else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))
    return self.sql(case_expr)
# BigQuery -> DuckDB conversion for the DATE function
def _date_sql(self: DuckDB.Generator, expression: exp.Date) -> str:
    this = expression.this
    zone = self.sql(expression, "zone")
    if zone:
        # BigQuery considers "this" at UTC, converts it to the specified
        # time zone and then keeps only the DATE part
        # To mimic that, we:
        # (1) Cast to TIMESTAMP to remove DuckDB's local tz
        # (2) Apply consecutive AtTimeZone calls for UTC -> zone conversion
        this = exp.cast(this, exp.DataType.Type.TIMESTAMP)
        at_utc = exp.AtTimeZone(this=this, zone=exp.Literal.string("UTC"))
        this = exp.AtTimeZone(this=at_utc, zone=zone)
    return self.sql(exp.cast(expression=this, to=exp.DataType.Type.DATE))
# BigQuery -> DuckDB conversion for the TIME_DIFF function
def _timediff_sql(self: DuckDB.Generator, expression: exp.TimeDiff) -> str:
    """Render TIME_DIFF as DuckDB DATE_DIFF, casting both operands to TIME."""
    if _is_nanosecond_unit(expression.unit):
        return _handle_nanosecond_diff(self, expression.expression, expression.this)
    end_time, start_time = (
        exp.cast(operand, exp.DataType.Type.TIME)
        for operand in (expression.this, expression.expression)
    )
    # Although the 2 dialects share similar signatures, BQ seems to inverse
    # the sign of the result so the start/end time operands are flipped
    return self.func("DATE_DIFF", unit_to_str(expression), start_time, end_time)
def _date_delta_to_binary_interval_op(
    cast: bool = True,
) -> t.Callable[[DuckDB.Generator, DATETIME_DELTA], str]:
    """
    DuckDB override to handle:
    1. NANOSECOND operations (DuckDB doesn't support INTERVAL ... NANOSECOND)
    2. Float/decimal interval values (DuckDB INTERVAL requires integers)

    Args:
        cast: forwarded to the shared `date_delta_to_binary_interval_op` base builder.
    Returns:
        A generator function rendering DATETIME_DELTA nodes to DuckDB SQL.
    """
    base_impl = date_delta_to_binary_interval_op(cast=cast)

    def _duckdb_date_delta_sql(self: DuckDB.Generator, expression: DATETIME_DELTA) -> str:
        unit = expression.unit
        interval_value = expression.expression
        # Handle NANOSECOND unit (DuckDB doesn't support INTERVAL ... NANOSECOND):
        # add raw nanoseconds to EPOCH_NS and rebuild via MAKE_TIMESTAMP_NS
        if _is_nanosecond_unit(unit):
            if isinstance(interval_value, exp.Interval):
                interval_value = interval_value.this
            timestamp_ns = exp.cast(expression.this, exp.DataType.Type.TIMESTAMP_NS)
            return self.sql(
                exp.func(
                    "MAKE_TIMESTAMP_NS",
                    exp.Add(this=exp.func("EPOCH_NS", timestamp_ns), expression=interval_value),
                )
            )
        # Handle float/decimal interval values as duckDB INTERVAL requires integer expressions
        if not interval_value or isinstance(interval_value, exp.Interval):
            return base_impl(self, expression)
        if interval_value.is_type(*exp.DataType.REAL_TYPES):
            # Round-then-cast mirrors rounding to the nearest whole interval step
            expression.set("expression", exp.cast(exp.func("ROUND", interval_value), "INT"))
        return base_impl(self, expression)

    return _duckdb_date_delta_sql
@unsupported_args(("expression", "DuckDB's ARRAY_SORT does not support a comparator."))
def _array_sort_sql(self: DuckDB.Generator, expression: exp.ArraySort) -> str:
    # The comparator ("expression" arg) is flagged unsupported by the decorator;
    # only the array operand is forwarded to DuckDB's ARRAY_SORT.
    return self.func("ARRAY_SORT", expression.this)
def _sort_array_sql(self: DuckDB.Generator, expression: exp.SortArray) -> str:
    """Render SortArray via ARRAY_SORT, or ARRAY_REVERSE_SORT when asc=FALSE was requested."""
    if expression.args.get("asc") == exp.false():
        func_name = "ARRAY_REVERSE_SORT"
    else:
        func_name = "ARRAY_SORT"
    return self.func(func_name, expression.this)
def _build_sort_array_desc(args: t.List) -> exp.Expression:
    """Build a descending SortArray node from a parsed argument list."""
    target = seq_get(args, 0)
    return exp.SortArray(this=target, asc=exp.false())
def _build_array_prepend(args: t.List) -> exp.Expression:
    """DuckDB's ARRAY_PREPEND call order is (element, list); the AST stores (list, element)."""
    element = seq_get(args, 0)
    target_list = seq_get(args, 1)
    return exp.ArrayPrepend(this=target_list, expression=element)
def _build_date_diff(args: t.List) -> exp.Expression:
    """DATE_DIFF(unit, start, end) -> DateDiff(this=end, expression=start, unit=unit)."""
    unit, start, end = (seq_get(args, i) for i in range(3))
    return exp.DateDiff(this=end, expression=start, unit=unit)
def _build_generate_series(end_exclusive: bool = False) -> t.Callable[[t.List], exp.GenerateSeries]:
    """Return a builder for GENERATE_SERIES; `end_exclusive` marks RANGE semantics."""

    def _builder(args: t.List) -> exp.GenerateSeries:
        # See https://duckdb.org/docs/sql/functions/nested.html#range-functions
        # With a single argument, DuckDB implicitly starts the series at 0
        if len(args) == 1:
            args = [exp.Literal.number("0"), *args]
        series = exp.GenerateSeries.from_arg_list(args)
        series.set("is_end_exclusive", end_exclusive)
        return series

    return _builder
def _build_make_timestamp(args: t.List) -> exp.Expression:
    """MAKE_TIMESTAMP(micros) -> UnixToTime; MAKE_TIMESTAMP(y, m, d, h, mi, s) -> TimestampFromParts."""
    if len(args) == 1:
        # Single-argument form: microseconds since the Unix epoch
        return exp.UnixToTime(this=seq_get(args, 0), scale=exp.UnixToTime.MICROS)
    part_names = ("year", "month", "day", "hour", "min", "sec")
    parts = {name: seq_get(args, position) for position, name in enumerate(part_names)}
    return exp.TimestampFromParts(**parts)
def _show_parser(*args: t.Any, **kwargs: t.Any) -> t.Callable[[DuckDB.Parser], exp.Show]:
    # Factory: returns a parser callback that delegates to `_parse_show_duckdb`
    # with the arguments bound here.
    def _parse(self: DuckDB.Parser) -> exp.Show:
        return self._parse_show_duckdb(*args, **kwargs)

    return _parse
def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
    """Render exp.Struct as a DuckDB struct literal {'k': v, ...}, or as ROW(...) for
    BigQuery-style inline structs that sit under a cast to a STRUCT type."""
    args: t.List[str] = []
    # BigQuery allows inline construction such as "STRUCT<a STRING, b INTEGER>('str', 1)" which is
    # canonicalized to "ROW('str', 1) AS STRUCT(a TEXT, b INT)" in DuckDB
    # The transformation to ROW will take place if:
    # 1. The STRUCT itself does not have proper fields (key := value) as a "proper" STRUCT would
    # 2. A cast to STRUCT / ARRAY of STRUCTs is found
    ancestor_cast = expression.find_ancestor(exp.Cast)
    is_bq_inline_struct = (
        (expression.find(exp.PropertyEQ) is None)
        and ancestor_cast
        and any(
            casted_type.is_type(exp.DataType.Type.STRUCT)
            for casted_type in ancestor_cast.find_all(exp.DataType)
        )
    )
    for i, expr in enumerate(expression.expressions):
        is_property_eq = isinstance(expr, exp.PropertyEQ)
        value = expr.expression if is_property_eq else expr
        if is_bq_inline_struct:
            # ROW(...) form carries only values; field names come from the cast type
            args.append(self.sql(value))
        else:
            if is_property_eq:
                if isinstance(expr.this, exp.Identifier):
                    key = self.sql(exp.Literal.string(expr.name))
                else:
                    key = self.sql(expr.this)
            else:
                # Positional (unnamed) entries get synthetic keys _0, _1, ...
                key = self.sql(exp.Literal.string(f"_{i}"))
            args.append(f"{key}: {self.sql(value)}")
    csv_args = ", ".join(args)
    return f"ROW({csv_args})" if is_bq_inline_struct else f"{{{csv_args}}}"
def _datatype_sql(self: DuckDB.Generator, expression: exp.DataType) -> str:
    # DuckDB spells arrays as `TYPE[size?]` rather than `ARRAY<TYPE>`
    if expression.is_type("array"):
        return f"{self.expressions(expression, flat=True)}[{self.expressions(expression, key='values', flat=True)}]"
    # Modifiers are not supported for TIME, [TIME | TIMESTAMP] WITH TIME ZONE
    if expression.is_type(
        exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ, exp.DataType.Type.TIMESTAMPTZ
    ):
        return expression.this.value
    return self.datatype_sql(expression)
def _json_format_sql(self: DuckDB.Generator, expression: exp.JSONFormat) -> str:
    """Render JSONFormat as TO_JSON(...) cast to TEXT so the result is a plain string."""
    to_json_sql = self.func("TO_JSON", expression.this, expression.args.get("options"))
    return f"CAST({to_json_sql} AS TEXT)"
def _unix_to_time_sql(self: DuckDB.Generator, expression: exp.UnixToTime) -> str:
    """Convert a Unix epoch value to a DuckDB timestamp, honoring the epoch `scale`
    (seconds/millis/micros/other) and an optional naive target type."""
    scale = expression.args.get("scale")
    timestamp = expression.this
    target_type = expression.args.get("target_type")
    # Check if we need NTZ (naive timestamp in UTC)
    is_ntz = target_type and target_type.this in (
        exp.DataType.Type.TIMESTAMP,
        exp.DataType.Type.TIMESTAMPNTZ,
    )
    if scale == exp.UnixToTime.MILLIS:
        # EPOCH_MS already returns TIMESTAMP (naive, UTC)
        return self.func("EPOCH_MS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        # MAKE_TIMESTAMP already returns TIMESTAMP (naive, UTC)
        return self.func("MAKE_TIMESTAMP", timestamp)
    # Other scales: divide and use TO_TIMESTAMP
    if scale not in (None, exp.UnixToTime.SECONDS):
        timestamp = exp.Div(this=timestamp, expression=exp.func("POW", 10, scale))
    to_timestamp: exp.Expression = exp.Anonymous(this="TO_TIMESTAMP", expressions=[timestamp])
    if is_ntz:
        # Strip the zone by anchoring the TO_TIMESTAMP result at UTC
        to_timestamp = exp.AtTimeZone(this=to_timestamp, zone=exp.Literal.string("UTC"))
    return self.sql(to_timestamp)
# Parent node types under which an arrow (->/->>) JSON extraction is parenthesized
WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In, exp.Not)


def _arrow_json_extract_sql(self: DuckDB.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    # Generate arrow syntax, then wrap it in parentheses when the parent is one of
    # the operator types above (unless the parent is the same arrow chain).
    arrow_sql = arrow_json_extract_sql(self, expression)
    if not expression.same_parent and isinstance(
        expression.parent, WRAPPED_JSON_EXTRACT_EXPRESSIONS
    ):
        arrow_sql = self.wrap(arrow_sql)
    return arrow_sql
def _implicit_datetime_cast(
    arg: t.Optional[exp.Expression], type: exp.DataType.Type = exp.DataType.Type.DATE
) -> t.Optional[exp.Expression]:
    """Wrap a string literal in a CAST to `type`, upgrading a DATE target to a
    (possibly tz-aware) TIMESTAMP when the literal contains a time component."""
    if not (isinstance(arg, exp.Literal) and arg.is_string):
        return arg
    literal_text = arg.name
    target = type
    if target == exp.DataType.Type.DATE and ":" in literal_text:
        # Time component present: TIMESTAMPTZ if an offset like +02[:30] appears
        if TIMEZONE_PATTERN.search(literal_text):
            target = exp.DataType.Type.TIMESTAMPTZ
        else:
            target = exp.DataType.Type.TIMESTAMP
    return exp.cast(arg, target)
def _week_unit_to_dow(unit: t.Optional[exp.Expression]) -> t.Optional[int]:
    """
    Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming
    from other dialects, e.g BigQuery's WEEK(<day>) or ISOWEEK unit parts.
    Args:
        unit: The unit expression (Var for WEEK/ISOWEEK, or WeekStart)
    Returns:
        The ISO 8601 day number (Monday=1, Sunday=7 etc) or None if not a week unit or if day is dynamic (not a constant).
    Examples:
        "WEEK(SUNDAY)" -> 7
        "WEEK(MONDAY)" -> 1
        "ISOWEEK" -> 1
    """
    # Handle plain Var units: WEEK and ISOWEEK are both Monday-based here.
    # BUGFIX: the previous check `unit.name.upper() in "ISOWEEK"` was a string
    # *substring* test, so unrelated names such as "EK", "SO" or the empty string
    # also matched. An explicit tuple keeps the two intended matches only.
    if isinstance(unit, exp.Var) and unit.name.upper() in ("WEEK", "ISOWEEK"):
        return 1
    # Handle WeekStart expressions with explicit day
    if isinstance(unit, exp.WeekStart):
        return WEEK_START_DAY_TO_DOW.get(unit.name.upper())
    return None
def _build_week_trunc_expression(date_expr: exp.Expression, start_dow: int) -> exp.Expression:
    """
    Build DATE_TRUNC expression for week boundaries with custom start day.
    Args:
        date_expr: The date expression to truncate
        start_dow: ISO 8601 day-of-week number of the desired week start
            (Monday=1, ..., Sunday=7)
    DuckDB's DATE_TRUNC('WEEK', ...) aligns weeks to Monday (ISO standard).
    To align to a different start day, we shift the date before truncating.
    Shift formula: Sunday (7) gets +1, others get (1 - start_dow)
    Examples:
        Monday (1): shift = 0 (no shift needed)
        Tuesday (2): shift = -1 (shift back 1 day) ...
        Sunday (7): shift = +1 (shift forward 1 day, wraps to next Monday-based week)
    """
    shift_days = 1 if start_dow == 7 else 1 - start_dow
    # Shift date to align week boundaries with the desired start day
    # No shift needed for Monday-based weeks (shift_days == 0)
    shifted_date = (
        exp.DateAdd(
            this=date_expr,
            expression=exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY")),
        )
        if shift_days != 0
        else date_expr
    )
    return exp.DateTrunc(unit=exp.var("WEEK"), this=shifted_date)
def _date_diff_sql(self: DuckDB.Generator, expression: exp.DateDiff) -> str:
    """Generate DuckDB DATE_DIFF, special-casing NANOSECOND and boundary-respecting WEEK units."""
    unit = expression.unit
    if _is_nanosecond_unit(unit):
        return _handle_nanosecond_diff(self, expression.this, expression.expression)
    this = _implicit_datetime_cast(expression.this)
    expr = _implicit_datetime_cast(expression.expression)
    # DuckDB's WEEK diff does not respect Monday crossing (week boundaries), it checks (end_day - start_day) / 7:
    # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
    # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference)
    # Whereas for other units such as MONTH it does respect month boundaries:
    # SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (Month crossed)
    date_part_boundary = expression.args.get("date_part_boundary")
    # Extract week start day; returns None if day is dynamic (column/placeholder)
    week_start = _week_unit_to_dow(unit)
    if date_part_boundary and week_start and this and expr:
        expression.set("unit", exp.Literal.string("WEEK"))
        # Truncate both dates to week boundaries to respect input dialect semantics
        this = _build_week_trunc_expression(this, week_start)
        expr = _build_week_trunc_expression(expr, week_start)
    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)
def _generate_datetime_array_sql(
    self: DuckDB.Generator, expression: t.Union[exp.GenerateDateArray, exp.GenerateTimestampArray]
) -> str:
    """Transpile BigQuery's GENERATE_DATE_ARRAY / GENERATE_TIMESTAMP_ARRAY to GENERATE_SERIES."""
    is_generate_date_array = isinstance(expression, exp.GenerateDateArray)
    type = exp.DataType.Type.DATE if is_generate_date_array else exp.DataType.Type.TIMESTAMP
    start = _implicit_datetime_cast(expression.args.get("start"), type=type)
    end = _implicit_datetime_cast(expression.args.get("end"), type=type)
    # BQ's GENERATE_DATE_ARRAY & GENERATE_TIMESTAMP_ARRAY are transformed to DuckDB'S GENERATE_SERIES
    gen_series: t.Union[exp.GenerateSeries, exp.Cast] = exp.GenerateSeries(
        start=start, end=end, step=expression.args.get("step")
    )
    if is_generate_date_array:
        # The GENERATE_SERIES result type is TIMESTAMP array, so to match BQ's semantics for
        # GENERATE_DATE_ARRAY we must cast it back to DATE array
        gen_series = exp.cast(gen_series, exp.DataType.build("ARRAY<DATE>"))
    return self.sql(gen_series)
def _json_extract_value_array_sql(
    self: DuckDB.Generator, expression: exp.JSONValueArray | exp.JSONExtractArray
) -> str:
    """Render JSON value/extract arrays as a JSON_EXTRACT cast to the matching array type."""
    if isinstance(expression, exp.JSONValueArray):
        array_type = "ARRAY<STRING>"
    else:
        array_type = "ARRAY<JSON>"
    extracted = exp.JSONExtract(this=expression.this, expression=expression.expression)
    return self.sql(exp.cast(extracted, to=exp.DataType.build(array_type)))
def _cast_to_varchar(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """Cast `arg` to VARCHAR unless it is missing, untyped, or already VARCHAR/UNKNOWN."""
    needs_cast = (
        arg is not None
        and arg.type
        and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN)
    )
    return exp.cast(arg, exp.DataType.Type.VARCHAR) if needs_cast else arg
def _cast_to_boolean(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """Cast `arg` to BOOLEAN unless it is missing or already boolean-typed."""
    if arg is None or arg.is_type(exp.DataType.Type.BOOLEAN):
        return arg
    return exp.cast(arg, exp.DataType.Type.BOOLEAN)
def _is_binary(arg: exp.Expression) -> bool:
    """Return True when `arg` carries a binary-like type (BINARY/VARBINARY/BLOB)."""
    binary_like = (
        exp.DataType.Type.BINARY,
        exp.DataType.Type.VARBINARY,
        exp.DataType.Type.BLOB,
    )
    return arg.is_type(*binary_like)
def _gen_with_cast_to_blob(
    self: DuckDB.Generator, expression: exp.Expression, result_sql: str
) -> str:
    """Wrap `result_sql` in a CAST to BLOB when `expression` is binary-typed."""
    if not _is_binary(expression):
        return result_sql
    blob_type = exp.DataType.build("BLOB", dialect="duckdb")
    return self.sql(exp.Cast(this=result_sql, to=blob_type))
def _cast_to_bit(arg: exp.Expression) -> exp.Expression:
    """Cast a binary-typed expression to BIT, routing hex-string literals through UNHEX."""
    if not _is_binary(arg):
        return arg
    target = arg
    if isinstance(target, exp.HexString):
        # A hex literal must be converted to bytes first before the BIT cast
        target = exp.Unhex(this=exp.Literal.string(target.this))
    return exp.cast(target, exp.DataType.Type.BIT)
def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
    """In place, replace binary-typed operands of a bitwise op with BIT casts."""
    for side in ("this", "expression"):
        operand = expression.args.get(side)
        if _is_binary(operand):
            expression.set(side, _cast_to_bit(operand))
def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
    """ANY_VALUE(x HAVING MAX/MIN y) -> ARG_MAX_NULL/ARG_MIN_NULL(x, y); otherwise fall back."""
    having_clause = expression.this
    if not isinstance(having_clause, exp.HavingMax):
        return self.function_fallback_sql(expression)
    is_max = having_clause.args.get("max")
    return self.func(
        "ARG_MAX_NULL" if is_max else "ARG_MIN_NULL",
        having_clause.this,
        having_clause.expression,
    )
def _bitwise_agg_sql(
    self: DuckDB.Generator,
    expression: t.Union[exp.BitwiseOrAgg, exp.BitwiseAndAgg, exp.BitwiseXorAgg],
) -> str:
    """
    DuckDB's bitwise aggregate functions only accept integer types. For other types:
    - DECIMAL/STRING: Use CAST(arg AS INT) to convert directly, will round to nearest int
    - FLOAT/DOUBLE: Use ROUND(arg)::INT to round to nearest integer, required due to float precision loss
    """
    if isinstance(expression, exp.BitwiseOrAgg):
        func_name = "BIT_OR"
    elif isinstance(expression, exp.BitwiseAndAgg):
        func_name = "BIT_AND"
    else:  # exp.BitwiseXorAgg
        func_name = "BIT_XOR"
    arg = expression.this
    if not arg.type:
        # Local import — presumably deferred to avoid a circular import; confirm
        from sqlglot.optimizer.annotate_types import annotate_types

        arg = annotate_types(arg, dialect=self.dialect)
    if arg.is_type(*exp.DataType.REAL_TYPES, *exp.DataType.TEXT_TYPES):
        if arg.is_type(*exp.DataType.FLOAT_TYPES):
            # float types need to be rounded first due to precision loss
            arg = exp.func("ROUND", arg)
        arg = exp.cast(arg, exp.DataType.Type.INT)
    return self.func(func_name, arg)
def _literal_sql_with_ws_chr(self: DuckDB.Generator, literal: str) -> str:
    """Render a string literal, emitting CHR(n) calls for whitespace control characters."""
    # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
    if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
        return self.sql(exp.Literal.string(literal))
    sql_segments: t.List[str] = []
    # Partition the literal into runs of control vs ordinary characters
    for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK):
        if is_ws_control:
            # One CHR() call per control character
            for ch in group:
                duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch]
                sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code))))
        else:
            sql_segments.append(self.sql(exp.Literal.string("".join(group))))
    sql = " || ".join(sql_segments)
    # Parenthesize concatenations so the result embeds safely in larger expressions
    return sql if len(sql_segments) == 1 else f"({sql})"
def _escape_regex_metachars(
    self: DuckDB.Generator, delimiters: t.Optional[exp.Expression], delimiters_sql: str
) -> str:
    r"""
    Escapes regex metacharacters \ - ^ [ ] for use in character classes regex expressions.
    Literal strings are escaped at transpile time, expressions handled with REPLACE() calls.

    Args:
        delimiters: the delimiters expression; None means nothing to escape
        delimiters_sql: the already-rendered SQL for `delimiters`
    Returns:
        SQL producing the escaped delimiter string.
    """
    if not delimiters:
        return delimiters_sql
    if delimiters.is_string:
        # Literal delimiters: escape now, routing control characters through CHR()
        literal_value = delimiters.this
        escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value)
        return _literal_sql_with_ws_chr(self, escaped_literal)
    # Dynamic delimiters: chain one REPLACE() per metacharacter at query runtime
    escaped_sql = delimiters_sql
    for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items():
        escaped_sql = self.func(
            "REPLACE",
            escaped_sql,
            self.sql(exp.Literal.string(raw)),
            self.sql(exp.Literal.string(escaped)),
        )
    return escaped_sql
def _build_capitalization_sql(
    self: DuckDB.Generator,
    value_to_split: str,
    delimiters_sql: str,
) -> str:
    """Emit SQL that capitalizes the first letter of each delimiter-separated word
    in `value_to_split`, where `delimiters_sql` yields the delimiter character set."""
    # empty string delimiter --> treat value as one word, no need to split
    if delimiters_sql == "''":
        return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))"
    delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')"
    split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')"
    # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments.
    # We do not know whether the first segment is a delimiter or not, so we check the first character of the string
    # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd.
    return self.func(
        "ARRAY_TO_STRING",
        exp.case()
        .when(
            f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})",
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        )
        .else_(
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        ),
        "''",
    )
def _initcap_sql(self: DuckDB.Generator, expression: exp.Initcap) -> str:
    """Transpile INITCAP by splitting on (regex-escaped) delimiter characters and
    capitalizing the first letter of each resulting word."""
    value_sql = self.sql(expression, "this")
    delims = expression.args.get("expression")
    if delims is None:
        # Manually-built Initcap nodes may lack a delimiters argument; fall back
        # to the dialect's default delimiter character set
        delims = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS)
    escaped_delims_sql = _escape_regex_metachars(self, delims, self.sql(delims))
    return _build_capitalization_sql(self, value_sql, escaped_delims_sql)
def _boolxor_agg_sql(self: DuckDB.Generator, expression: exp.BoolxorAgg) -> str:
    """
    Emulate Snowflake's BOOLXOR_AGG in DuckDB.

    BOOLXOR_AGG(col) is TRUE iff exactly one input in `col` is TRUE. DuckDB has
    no direct equivalent, so we render `COUNT_IF(col) = 1`. COUNT_IF strictly
    requires a boolean argument, hence the cast helper on the input.
    """
    count_true = exp.CountIf(this=_cast_to_boolean(expression.this))
    equals_one = exp.EQ(this=count_true, expression=exp.Literal.number(1))
    return self.sql(equals_one)
def _bitshift_sql(
    self: DuckDB.Generator, expression: exp.BitwiseLeftShift | exp.BitwiseRightShift
) -> str:
    """
    Render bitwise shifts for DuckDB, injecting BIT/INT128 casts where needed.

    DuckDB's shift operators don't accept BLOB/BINARY operands, so those are cast
    to BIT for the operation and the result cast back to BLOB afterwards.
    Note: assumes type annotation was applied with the source dialect.
    """
    is_left_shift = isinstance(expression, exp.BitwiseLeftShift)
    operand = expression.this

    cast_back_to_blob = _is_binary(operand)
    if cast_back_to_blob:
        expression.set("this", exp.cast(operand, exp.DataType.Type.BIT))
    elif expression.args.get("requires_int128"):
        operand.replace(exp.cast(operand, exp.DataType.Type.INT128))

    shift_sql = self.binary(expression, "<<" if is_left_shift else ">>")

    # DuckDB parses `a << b | c << d` as `(a << b | c) << d`, so parenthesize
    # whenever this shift sits under another binary operator.
    if isinstance(expression.parent, exp.Binary):
        shift_sql = self.sql(exp.Paren(this=shift_sql))

    if cast_back_to_blob:
        shift_sql = self.sql(
            exp.Cast(this=shift_sql, to=exp.DataType.build("BLOB", dialect="duckdb"))
        )

    return shift_sql
def _scale_rounding_sql(
    self: DuckDB.Generator,
    expression: exp.Expression,
    rounding_func: type[exp.Expression],
) -> str | None:
    """
    Rewrite a rounding function's scale argument for DuckDB.

    DuckDB's FLOOR/CEIL do not take a scale parameter, so FUNC(x, n) is
    rewritten to ROUND(FUNC(x * 10^n) / 10^n, n).

    Args:
        self: The DuckDB generator instance
        expression: Expression carrying 'this', 'decimals', and 'to' args
        rounding_func: The rounding expression class used in the rewrite

    Returns:
        SQL for the rewritten expression, or None when there is no scale to handle.
    """
    scale = expression.args.get("decimals")
    if scale is None or expression.args.get("to") is not None:
        return None

    operand = expression.this
    if isinstance(operand, exp.Binary):
        # Parenthesize so the multiplication by 10^n binds correctly.
        operand = exp.Paren(this=operand)

    # The exponent must be integral; cast unless we can prove it already is.
    exponent = (
        scale
        if scale.is_int or scale.is_type(*exp.DataType.INTEGER_TYPES)
        else exp.cast(scale, exp.DataType.Type.INT)
    )

    factor = exp.Pow(this=exp.Literal.number("10"), expression=exponent)
    scaled = rounding_func(this=exp.Mul(this=operand, expression=factor))
    unscaled = exp.Div(this=scaled, expression=factor.copy())

    return self.round_sql(
        exp.Round(this=unscaled, decimals=scale, casts_non_integer_decimals=True)
    )
def _ceil_floor(self: DuckDB.Generator, expression: exp.Floor | exp.Ceil) -> str:
    """Render FLOOR/CEIL, rewriting any scale argument DuckDB can't take natively."""
    rewritten = _scale_rounding_sql(self, expression, type(expression))
    # Explicit None check: the rewrite result is a SQL string and must not be
    # confused with an empty/falsy value.
    return self.ceil_floor(expression) if rewritten is None else rewritten
def _regr_val_sql(
    self: DuckDB.Generator,
    expression: exp.RegrValx | exp.RegrValy,
) -> str:
    """
    Transpile Snowflake's REGR_VALX/REGR_VALY to a DuckDB IF expression.

    REGR_VALX(y, x) returns NULL if y is NULL; otherwise returns x.
    REGR_VALY(y, x) returns NULL if x is NULL; otherwise returns y.
    """
    from sqlglot.optimizer.annotate_types import annotate_types

    y, x = expression.this, expression.expression

    # Select which argument is probed for NULL and which is returned, per variant.
    if isinstance(expression, exp.RegrValx):
        null_probe, returned, returned_attr = y, x, "expression"
    else:
        null_probe, returned, returned_attr = x, y, "this"

    result_type = returned.type
    if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
        # No type info yet: best-effort inference by annotating a copy of the node.
        try:
            annotated = annotate_types(expression.copy(), dialect=self.dialect)
            result_type = getattr(annotated, returned_attr).type
        except Exception:
            pass

    if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
        # Regression functions fall back to DOUBLE when inference fails.
        result_type = exp.DataType.build("DOUBLE")

    # Cast the NULL branch to the result type to sidestep DuckDB type-inference issues.
    typed_null = exp.Cast(this=exp.Null(), to=result_type)
    return self.sql(
        exp.If(
            this=exp.Is(this=null_probe.copy(), expression=exp.Null()),
            true=typed_null,
            false=returned.copy(),
        )
    )
def _maybe_corr_null_to_false(
expression: t.Union[exp.Filter, exp.Window, exp.Corr],
) -> t.Optional[t.Union[exp.Filter, exp.Window, exp.Corr]]:
corr = expression
while isinstance(corr, (exp.Window, exp.Filter)):