20
20
import datetime
21
21
import logging
22
22
import os
23
- import re
24
23
import secrets
25
24
import typing
26
25
from typing import (
86
85
import bigframes .core .tree_properties as tree_properties
87
86
import bigframes .core .utils as utils
88
87
import bigframes .dtypes
88
+ import bigframes .exceptions
89
89
import bigframes .formatting_helpers as formatting_helpers
90
90
from bigframes .functions .remote_function import read_gbq_function as bigframes_rgf
91
91
from bigframes .functions .remote_function import remote_function as bigframes_rf
92
- import bigframes .session ._io .bigquery as bigframes_io
92
+ import bigframes .session ._io .bigquery as bf_io_bigquery
93
93
import bigframes .session ._io .bigquery .read_gbq_table as bf_read_gbq_table
94
94
import bigframes .session .clients
95
95
import bigframes .version
145
145
)
146
146
147
147
148
- def _is_query (query_or_table : str ) -> bool :
149
- """Determine if `query_or_table` is a table ID or a SQL string"""
150
- return re .search (r"\s" , query_or_table .strip (), re .MULTILINE ) is not None
148
def _to_index_cols(
    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
) -> List[str]:
    """Convert index_col into a list of column names.

    Args:
        index_col: A single column name, an iterable of column names, or a
            ``bigframes.enums.DefaultIndexKind`` value.

    Returns:
        A new list of column names: empty for a ``DefaultIndexKind`` value,
        a one-element list for a single name, otherwise the items of the
        iterable in order.
    """
    # A DefaultIndexKind value names no columns, so it maps to an empty list.
    if isinstance(index_col, bigframes.enums.DefaultIndexKind):
        return []
    # A bare string is one column name; it must be checked before the
    # generic iterable case, which would split it into characters.
    if isinstance(index_col, str):
        return [index_col]
    return list(index_col)


def _is_table_with_wildcard_suffix(query_or_table: str) -> bool:
    """Determine if `query_or_table` is a table and contains a wildcard suffix.

    Args:
        query_or_table: Either a SQL string or a table ID.

    Returns:
        True only when the input is not classified as a query by
        ``_is_query`` and ends with ``*``.
    """
    return not _is_query(query_or_table) and query_or_table.endswith("*")
156
160
157
161
158
162
class Session (
@@ -181,12 +185,26 @@ def __init__(
181
185
if context is None :
182
186
context = bigquery_options .BigQueryOptions ()
183
187
184
- # TODO(swast): Get location from the environment.
185
188
if context .location is None :
186
189
self ._location = "US"
187
190
warnings .warn (
188
191
f"No explicit location is set, so using location { self ._location } for the session." ,
189
- stacklevel = 2 ,
192
+ # User's code
193
+ # -> get_global_session()
194
+ # -> connect()
195
+ # -> Session()
196
+ #
197
+ # Note: We could also have:
198
+ # User's code
199
+ # -> read_gbq()
200
+ # -> with_default_session()
201
+ # -> get_global_session()
202
+ # -> connect()
203
+ # -> Session()
204
+ # but we currently have no way to disambiguate these
205
+ # situations.
206
+ stacklevel = 4 ,
207
+ category = bigframes .exceptions .DefaultLocationWarning ,
190
208
)
191
209
else :
192
210
self ._location = context .location
@@ -322,13 +340,19 @@ def read_gbq(
322
340
columns = col_order
323
341
324
342
filters = list (filters )
325
- if len (filters ) != 0 or _is_table_with_wildcard_suffix (query_or_table ):
343
+ if len (filters ) != 0 or bf_io_bigquery .is_table_with_wildcard_suffix (
344
+ query_or_table
345
+ ):
326
346
# TODO(b/338111344): This appears to be missing index_cols, which
327
347
# are necessary to be selected.
328
- # TODO(b/338039517): Also, need to account for primary keys.
329
- query_or_table = self ._to_query (query_or_table , columns , filters )
348
+ # TODO(b/338039517): Refactor this to be called inside both
349
+ # _read_gbq_query and _read_gbq_table (after detecting primary keys)
350
+ # so we can make sure index_col/index_cols reflects primary keys.
351
+ query_or_table = bf_io_bigquery .to_query (
352
+ query_or_table , _to_index_cols (index_col ), columns , filters
353
+ )
330
354
331
- if _is_query (query_or_table ):
355
+ if bf_io_bigquery . is_query (query_or_table ):
332
356
return self ._read_gbq_query (
333
357
query_or_table ,
334
358
index_col = index_col ,
@@ -355,85 +379,6 @@ def read_gbq(
355
379
use_cache = use_cache if use_cache is not None else True ,
356
380
)
357
381
358
def _to_query(
    self,
    query_or_table: str,
    columns: Iterable[str],
    filters: third_party_pandas_gbq.FiltersType,
) -> str:
    """Compile query_or_table with conditions(filters, wildcards) to query.

    Args:
        query_or_table: A SQL string or a table ID. A SQL string is wrapped
            in parentheses as a sub-query; a table ID is quoted in backticks.
        columns: Column names to select. When empty, ``*`` is selected.
        filters: Either a flat sequence of ``(column, operator, value)``
            tuples, which are combined with AND, or a sequence of such
            groups, where the groups are combined with OR.

    Returns:
        A ``SELECT ... FROM ... AS sub`` statement, followed by a ``WHERE``
        clause when filters are given.

    Raises:
        ValueError: If a filter is not a tuple of length 3, if its column
            name is not a string, or if its operator is not supported.
    """
    # Materialize both arguments first: an exhausted or empty iterator is
    # still truthy, which would otherwise yield an empty SELECT list.
    columns = list(columns)
    filters = list(filters)

    sub_query = (
        f"({query_or_table})"
        if _is_query(query_or_table)
        else f"`{query_or_table}`"
    )

    # TODO(b/338111344): Generate an index based on DefaultIndexKind if we
    # don't have index columns specified.
    select_clause = "SELECT " + (
        ", ".join(f"`{column}`" for column in columns) if columns else "*"
    )

    where_clause = ""
    if filters:
        # Maps each accepted filter operator to its SQL spelling.
        valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = {
            "in": "IN",
            "not in": "NOT IN",
            "LIKE": "LIKE",
            "==": "=",
            ">": ">",
            "<": "<",
            ">=": ">=",
            "<=": "<=",
            "!=": "!=",
        }

        # If single layer filter, add another pseudo layer. So the single
        # layer represents "and" logic.
        if isinstance(filters[0], tuple) and (
            len(filters[0]) == 0 or not isinstance(list(filters[0])[0], tuple)
        ):
            filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters])

        or_expressions = []
        for group in filters:
            if not isinstance(group, Iterable):
                group = [group]

            and_expressions = []
            for filter_item in group:
                if not isinstance(filter_item, tuple) or (len(filter_item) != 3):
                    raise ValueError(
                        f"Filter condition should be a tuple of length 3, {filter_item} is not valid."
                    )

                column, operator, value = filter_item

                if not isinstance(column, str):
                    raise ValueError(
                        f"Column name should be a string, but received '{column}' of type {type(column).__name__}."
                    )

                if operator not in valid_operators:
                    raise ValueError(f"Operator {operator} is not valid.")

                operator_str = valid_operators[operator]

                # NOTE(review): values are rendered with repr(), i.e. as
                # Python literals. That is presumably not valid SQL for
                # every value type (e.g. None) - confirm against callers.
                if operator_str in ["IN", "NOT IN"]:
                    value_list = ", ".join(repr(v) for v in value)
                    expression = f"`{column}` {operator_str} ({value_list})"
                else:
                    expression = f"`{column}` {operator_str} {repr(value)}"
                and_expressions.append(expression)

            or_expressions.append(" AND ".join(and_expressions))

        if or_expressions:
            where_clause = " WHERE " + " OR ".join(or_expressions)

    return f"{select_clause} FROM {sub_query} AS sub{where_clause}"
436
-
437
382
def _query_to_destination (
438
383
self ,
439
384
query : str ,
@@ -610,12 +555,7 @@ def _read_gbq_query(
610
555
True if use_cache is None else use_cache
611
556
)
612
557
613
- if isinstance (index_col , bigframes .enums .DefaultIndexKind ):
614
- index_cols = []
615
- elif isinstance (index_col , str ):
616
- index_cols = [index_col ]
617
- else :
618
- index_cols = list (index_col )
558
+ index_cols = _to_index_cols (index_col )
619
559
620
560
destination , query_job = self ._query_to_destination (
621
561
query ,
@@ -682,8 +622,13 @@ def read_gbq_table(
682
622
columns = col_order
683
623
684
624
filters = list (filters )
685
- if len (filters ) != 0 or _is_table_with_wildcard_suffix (query ):
686
- query = self ._to_query (query , columns , filters )
625
+ if len (filters ) != 0 or bf_io_bigquery .is_table_with_wildcard_suffix (query ):
626
+ # TODO(b/338039517): Refactor this to be called inside both
627
+ # _read_gbq_query and _read_gbq_table (after detecting primary keys)
628
+ # so we can make sure index_col/index_cols reflects primary keys.
629
+ query = bf_io_bigquery .to_query (
630
+ query , _to_index_cols (index_col ), columns , filters
631
+ )
687
632
688
633
return self ._read_gbq_query (
689
634
query ,
@@ -838,12 +783,7 @@ def _read_bigquery_load_job(
838
783
index_col : Iterable [str ] | str | bigframes .enums .DefaultIndexKind = (),
839
784
columns : Iterable [str ] = (),
840
785
) -> dataframe .DataFrame :
841
- if isinstance (index_col , bigframes .enums .DefaultIndexKind ):
842
- index_cols = []
843
- elif isinstance (index_col , str ):
844
- index_cols = [index_col ]
845
- else :
846
- index_cols = list (index_col )
786
+ index_cols = _to_index_cols (index_col )
847
787
848
788
if not job_config .clustering_fields and index_cols :
849
789
job_config .clustering_fields = index_cols [:_MAX_CLUSTER_COLUMNS ]
@@ -1430,7 +1370,7 @@ def _create_empty_temp_table(
1430
1370
datetime .datetime .now (datetime .timezone .utc ) + constants .DEFAULT_EXPIRATION
1431
1371
)
1432
1372
1433
- table = bigframes_io .create_temp_table (
1373
+ table = bf_io_bigquery .create_temp_table (
1434
1374
self ,
1435
1375
expiration ,
1436
1376
schema = schema ,
0 commit comments