25
25
import sqlglot .expressions as sge
26
26
27
27
from bigframes import dtypes
28
- from bigframes .core import guid
28
+ from bigframes .core import guid , utils
29
29
from bigframes .core .compile .sqlglot .expressions import typed_expr
30
30
import bigframes .core .compile .sqlglot .sqlglot_types as sgt
31
31
import bigframes .core .local_data as local_data
@@ -71,7 +71,10 @@ def from_pyarrow(
71
71
schema : bf_schema .ArraySchema ,
72
72
uid_gen : guid .SequentialUIDGenerator ,
73
73
) -> SQLGlotIR :
74
- """Builds SQLGlot expression from pyarrow table."""
74
+ """Builds SQLGlot expression from a pyarrow table.
75
+
76
+ This is used to represent in-memory data as a SQL query.
77
+ """
75
78
dtype_expr = sge .DataType (
76
79
this = sge .DataType .Type .STRUCT ,
77
80
expressions = [
@@ -117,6 +120,16 @@ def from_table(
117
120
alias_names : typing .Sequence [str ],
118
121
uid_gen : guid .SequentialUIDGenerator ,
119
122
) -> SQLGlotIR :
123
+ """Builds a SQLGlotIR expression from a BigQuery table.
124
+
125
+ Args:
126
+ project_id (str): The project ID of the BigQuery table.
127
+ dataset_id (str): The dataset ID of the BigQuery table.
128
+ table_id (str): The table ID of the BigQuery table.
129
+ col_names (typing.Sequence[str]): The names of the columns to select.
130
+ alias_names (typing.Sequence[str]): The aliases for the selected columns.
131
+ uid_gen (guid.SequentialUIDGenerator): A generator for unique identifiers.
132
+ """
120
133
selections = [
121
134
sge .Alias (
122
135
this = sge .to_identifier (col_name , quoted = cls .quoted ),
@@ -137,7 +150,7 @@ def from_query_string(
137
150
cls ,
138
151
query_string : str ,
139
152
) -> SQLGlotIR :
140
- """Builds SQLGlot expression from a query string"""
153
+ """Builds a SQLGlot expression from a query string."""
141
154
uid_gen : guid .SequentialUIDGenerator = guid .SequentialUIDGenerator ()
142
155
cte_name = sge .to_identifier (
143
156
next (uid_gen .get_uid_stream ("bfcte_" )), quoted = cls .quoted
@@ -157,7 +170,7 @@ def from_union(
157
170
output_ids : typing .Sequence [str ],
158
171
uid_gen : guid .SequentialUIDGenerator ,
159
172
) -> SQLGlotIR :
160
- """Builds SQLGlot expression by union of multiple select expressions."""
173
+ """Builds a SQLGlot expression by unioning multiple select expressions."""
161
174
assert (
162
175
len (list (selects )) >= 2
163
176
), f"At least two select expressions must be provided, but got { selects } ."
@@ -205,6 +218,7 @@ def select(
205
218
self ,
206
219
selected_cols : tuple [tuple [str , sge .Expression ], ...],
207
220
) -> SQLGlotIR :
221
+ """Replaces the selected columns of the current SELECT clause with the given ones."""
208
222
selections = [
209
223
sge .Alias (
210
224
this = expr ,
@@ -213,15 +227,41 @@ def select(
213
227
for id , expr in selected_cols
214
228
]
215
229
216
- new_expr , _ = self ._encapsulate_as_cte ()
230
+ new_expr = _select_to_cte (
231
+ self .expr ,
232
+ sge .to_identifier (
233
+ next (self .uid_gen .get_uid_stream ("bfcte_" )), quoted = self .quoted
234
+ ),
235
+ )
217
236
new_expr = new_expr .select (* selections , append = False )
218
237
return SQLGlotIR (expr = new_expr , uid_gen = self .uid_gen )
219
238
239
def project(
    self,
    projected_cols: tuple[tuple[str, sge.Expression], ...],
) -> SQLGlotIR:
    """Appends extra aliased columns to the SELECT clause.

    The current query is pushed into a freshly named CTE, and the new
    columns are appended to the ``SELECT *`` that reads from that CTE.

    Args:
        projected_cols: Pairs of (output column id, expression) to add.

    Returns:
        A new SQLGlotIR that shares this instance's uid generator.
    """
    aliased_cols = []
    for col_id, col_expr in projected_cols:
        alias_ident = sge.to_identifier(col_id, quoted=self.quoted)
        aliased_cols.append(sge.Alias(this=col_expr, alias=alias_ident))

    cte_name = sge.to_identifier(
        next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
    )
    wrapped = _select_to_cte(self.expr, cte_name)
    # append=True keeps the existing projection and adds the new columns.
    return SQLGlotIR(
        expr=wrapped.select(*aliased_cols, append=True),
        uid_gen=self.uid_gen,
    )
259
+
220
260
def order_by (
221
261
self ,
222
262
ordering : tuple [sge .Ordered , ...],
223
263
) -> SQLGlotIR :
224
- """Adds ORDER BY clause to the query."""
264
+ """Adds an ORDER BY clause to the query."""
225
265
if len (ordering ) == 0 :
226
266
return SQLGlotIR (expr = self .expr .copy (), uid_gen = self .uid_gen )
227
267
new_expr = self .expr .order_by (* ordering )
@@ -231,34 +271,24 @@ def limit(
231
271
self ,
232
272
limit : int | None ,
233
273
) -> SQLGlotIR :
234
- """Adds LIMIT clause to the query."""
274
+ """Adds a LIMIT clause to the query."""
235
275
if limit is not None :
236
276
new_expr = self .expr .limit (limit )
237
277
else :
238
278
new_expr = self .expr .copy ()
239
279
return SQLGlotIR (expr = new_expr , uid_gen = self .uid_gen )
240
280
241
- def project (
242
- self ,
243
- projected_cols : tuple [tuple [str , sge .Expression ], ...],
244
- ) -> SQLGlotIR :
245
- projected_cols_expr = [
246
- sge .Alias (
247
- this = expr ,
248
- alias = sge .to_identifier (id , quoted = self .quoted ),
249
- )
250
- for id , expr in projected_cols
251
- ]
252
- new_expr , _ = self ._encapsulate_as_cte ()
253
- new_expr = new_expr .select (* projected_cols_expr , append = True )
254
- return SQLGlotIR (expr = new_expr , uid_gen = self .uid_gen )
255
-
256
281
def filter(
    self,
    condition: sge.Expression,
) -> SQLGlotIR:
    """Filters the query by adding a WHERE clause.

    The current query is pushed into a freshly named CTE first, so the
    condition is applied to the ``SELECT *`` that reads from that CTE.

    Args:
        condition: Boolean expression used as the WHERE predicate.

    Returns:
        A new SQLGlotIR that shares this instance's uid generator.
    """
    cte_name = sge.to_identifier(
        next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
    )
    wrapped = _select_to_cte(self.expr, cte_name)
    filtered = wrapped.where(condition, append=False)
    return SQLGlotIR(expr=filtered, uid_gen=self.uid_gen)
@@ -272,8 +302,15 @@ def join(
272
302
joins_nulls : bool = True ,
273
303
) -> SQLGlotIR :
274
304
"""Joins the current query with another SQLGlotIR instance."""
275
- left_select , left_table = self ._encapsulate_as_cte ()
276
- right_select , right_table = right ._encapsulate_as_cte ()
305
+ left_cte_name = sge .to_identifier (
306
+ next (self .uid_gen .get_uid_stream ("bfcte_" )), quoted = self .quoted
307
+ )
308
+ right_cte_name = sge .to_identifier (
309
+ next (self .uid_gen .get_uid_stream ("bfcte_" )), quoted = self .quoted
310
+ )
311
+
312
+ left_select = _select_to_cte (self .expr , left_cte_name )
313
+ right_select = _select_to_cte (right .expr , right_cte_name )
277
314
278
315
left_ctes = left_select .args .pop ("with" , [])
279
316
right_ctes = right_select .args .pop ("with" , [])
@@ -288,17 +325,50 @@ def join(
288
325
new_expr = (
289
326
sge .Select ()
290
327
.select (sge .Star ())
291
- .from_ (left_table )
292
- .join (right_table , on = join_on , join_type = join_type_str )
328
+ .from_ (sge . Table ( this = left_cte_name ) )
329
+ .join (sge . Table ( this = right_cte_name ) , on = join_on , join_type = join_type_str )
293
330
)
294
331
new_expr .set ("with" , sge .With (expressions = merged_ctes ))
295
332
296
333
return SQLGlotIR (expr = new_expr , uid_gen = self .uid_gen )
297
334
335
def explode(
    self,
    column_names: tuple[str, ...],
    offsets_col: typing.Optional[str],
) -> SQLGlotIR:
    """Unnests one or more array columns.

    Dispatches to the single-column or the multi-column helper depending
    on how many column names are given.

    Args:
        column_names: Names of the columns to unnest; must not be empty.
        offsets_col: Optional offsets column name, forwarded unchanged to
            the helper.

    Returns:
        The SQLGlotIR produced by the selected helper.

    Raises:
        AssertionError: If ``column_names`` is empty.
    """
    # ``column_names`` is already a sized tuple; no throwaway list is needed.
    num_columns = len(column_names)
    assert num_columns > 0, "At least one column must be provided for explode."
    if num_columns == 1:
        return self._explode_single_column(column_names[0], offsets_col)
    return self._explode_multiple_columns(column_names, offsets_col)
347
+
348
def sample(self, fraction: float) -> SQLGlotIR:
    """Uniformly samples a fraction of the rows.

    A ``RAND()`` column with a generated ``bfcol_`` name is appended to the
    current SELECT. That query is pushed into a freshly named CTE, and the
    outer ``SELECT *`` keeps only rows whose random value is below
    ``fraction``.

    Args:
        fraction: Threshold compared against the per-row random value.

    Returns:
        A new SQLGlotIR that shares this instance's uid generator.
    """
    rand_col = sge.to_identifier(
        next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted
    )
    rand_alias = sge.Alias(this=sge.func("RAND"), alias=rand_col)
    keep_row = sge.LT(
        this=rand_col,
        expression=_literal(fraction, dtypes.FLOAT_DTYPE),
    )

    cte_name = sge.to_identifier(
        next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
    )
    # The random column has to exist inside the CTE so that the outer WHERE
    # can reference it by name.
    with_rand = self.expr.select(rand_alias, append=True)
    sampled = _select_to_cte(with_rand, cte_name).where(keep_row, append=False)
    return SQLGlotIR(expr=sampled, uid_gen=self.uid_gen)
366
+
298
367
def insert(
    self,
    destination: bigquery.TableReference,
) -> str:
    """Generates an INSERT INTO SQL statement from the current SELECT clause.

    Args:
        destination: Reference to the table that receives the rows.

    Returns:
        The SQL text rendered with this instance's dialect and pretty
        settings.
    """
    # The current query is wrapped as a subquery and used as the row source.
    insert_expr = sge.insert(self.expr.subquery(), _table(destination))
    return insert_expr.sql(dialect=self.dialect, pretty=self.pretty)
@@ -307,6 +377,9 @@ def replace(
307
377
self ,
308
378
destination : bigquery .TableReference ,
309
379
) -> str :
380
+ """Generates a MERGE statement that replaces the destination table's contents
381
+ with the result of the current SELECT clause.
382
+ """
310
383
# Workaround for SQLGlot breaking change:
311
384
# https://github.com/tobymao/sqlglot/pull/4495
312
385
whens_expr = [
@@ -325,23 +398,10 @@ def replace(
325
398
).sql (dialect = self .dialect , pretty = self .pretty )
326
399
return f"{ merge_str } \n { whens_str } "
327
400
328
- def explode (
329
- self ,
330
- column_names : tuple [str , ...],
331
- offsets_col : typing .Optional [str ],
332
- ) -> SQLGlotIR :
333
- num_columns = len (list (column_names ))
334
- assert num_columns > 0 , "At least one column must be provided for explode."
335
- if num_columns == 1 :
336
- return self ._explode_single_column (column_names [0 ], offsets_col )
337
- else :
338
- return self ._explode_multiple_columns (column_names , offsets_col )
339
-
340
401
def _explode_single_column (
341
402
self , column_name : str , offsets_col : typing .Optional [str ]
342
403
) -> SQLGlotIR :
343
404
"""Helper method to handle the case of exploding a single column."""
344
-
345
405
offset = (
346
406
sge .to_identifier (offsets_col , quoted = self .quoted ) if offsets_col else None
347
407
)
@@ -358,7 +418,12 @@ def _explode_single_column(
358
418
359
419
# TODO: "CROSS" if not keep_empty else "LEFT"
360
420
# TODO: overlaps_with_parent to replace existing column.
361
- new_expr , _ = self ._encapsulate_as_cte ()
421
+ new_expr = _select_to_cte (
422
+ self .expr ,
423
+ sge .to_identifier (
424
+ next (self .uid_gen .get_uid_stream ("bfcte_" )), quoted = self .quoted
425
+ ),
426
+ )
362
427
new_expr = new_expr .select (selection , append = False ).join (
363
428
unnest_expr , join_type = "CROSS"
364
429
)
@@ -408,33 +473,32 @@ def _explode_multiple_columns(
408
473
for column in columns
409
474
]
410
475
)
411
- new_expr , _ = self ._encapsulate_as_cte ()
476
+ new_expr = _select_to_cte (
477
+ self .expr ,
478
+ sge .to_identifier (
479
+ next (self .uid_gen .get_uid_stream ("bfcte_" )), quoted = self .quoted
480
+ ),
481
+ )
412
482
new_expr = new_expr .select (selection , append = False ).join (
413
483
unnest_expr , join_type = "CROSS"
414
484
)
415
485
return SQLGlotIR (expr = new_expr , uid_gen = self .uid_gen )
416
486
417
- def _encapsulate_as_cte (
418
- self ,
419
- ) -> typing .Tuple [sge .Select , sge .Table ]:
420
- """Transforms a given sge.Select query by pushing its main SELECT statement
421
- into a new CTE and then generates a 'SELECT * FROM new_cte_name'
422
- for the new query."""
423
- select_expr = self .expr .copy ()
424
487
425
- existing_ctes = select_expr .args .pop ("with" , [])
426
- new_cte_name = sge .to_identifier (
427
- next (self .uid_gen .get_uid_stream ("bfcte_" )), quoted = self .quoted
428
- )
429
- new_cte = sge .CTE (
430
- this = select_expr ,
431
- alias = new_cte_name ,
432
- )
433
- new_with_clause = sge .With (expressions = [* existing_ctes , new_cte ])
434
- new_table_expr = sge .Table (this = new_cte_name )
435
- new_select_expr = sge .Select ().select (sge .Star ()).from_ (new_table_expr )
436
- new_select_expr .set ("with" , new_with_clause )
437
- return new_select_expr , new_table_expr
488
def _select_to_cte(expr: sge.Select, cte_name: sge.Identifier) -> sge.Select:
    """Wraps a SELECT in a new CTE and returns ``SELECT * FROM <cte_name>``.

    The input expression is copied, not modified. Any CTEs already attached
    to it are moved onto the returned query's WITH clause, followed by the
    new CTE that holds the original SELECT.

    Args:
        expr: The SELECT expression to push into a CTE.
        cte_name: Identifier used both as the CTE alias and as the table
            name in the outer FROM clause.

    Returns:
        A new SELECT that reads every column from the new CTE.
    """
    inner = expr.copy()
    # Detach existing CTEs from the inner query so that they end up on the
    # outer WITH clause, listed before the new CTE.
    prior_ctes = inner.args.pop("with", [])
    wrapping_cte = sge.CTE(this=inner, alias=cte_name)
    with_clause = sge.With(expressions=[*prior_ctes, wrapping_cte])

    outer = sge.Select().select(sge.Star()).from_(sge.Table(this=cte_name))
    outer.set("with", with_clause)
    return outer
438
502
439
503
440
504
def _literal (value : typing .Any , dtype : dtypes .Dtype ) -> sge .Expression :
@@ -454,6 +518,8 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
454
518
return sge .func ("ST_GEOGFROMTEXT" , sge .convert (wkt ))
455
519
elif dtype == dtypes .JSON_DTYPE :
456
520
return sge .ParseJSON (this = sge .convert (str (value )))
521
+ elif dtype == dtypes .TIMEDELTA_DTYPE :
522
+ return sge .convert (utils .timedelta_to_micros (value ))
457
523
elif dtypes .is_struct_like (dtype ):
458
524
items = [
459
525
_literal (value = value [field_name ], dtype = field_dtype ).as_ (
0 commit comments