@@ -25,11 +25,20 @@
import dataclasses
import functools
import itertools
-import os
import random
import textwrap
import typing
-from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union
+from typing import (
+    Iterable,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+    TYPE_CHECKING,
+    Union,
+)
import warnings

import bigframes_vendored.constants as constants
@@ -56,7 +65,10 @@
import bigframes.features
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
-import bigframes.session._io.pandas
+import bigframes.session._io.pandas as io_pandas
+
+if TYPE_CHECKING:
+    import bigframes.session.executor

# Type constraint for wherever column labels are used
Label = typing.Hashable
@@ -450,46 +462,14 @@ def reorder_levels(self, ids: typing.Sequence[str]):
        level_names = [self.col_id_to_index_name[index_id] for index_id in ids]
        return Block(self.expr, ids, self.column_labels, level_names)

-    def _to_dataframe(self, result) -> pd.DataFrame:
-        """Convert BigQuery data to pandas DataFrame with specific dtypes."""
-        result_dataframe = self.session._rows_to_dataframe(result)
-        # Runs strict validations to ensure internal type predictions and ibis are completely in sync
-        # Do not execute these validations outside of testing suite.
-        if "PYTEST_CURRENT_TEST" in os.environ:
-            self._validate_result_schema(result.schema)
-        return result_dataframe
-
-    def _validate_result_schema(
-        self, bq_result_schema: list[bigquery.schema.SchemaField]
-    ):
-        actual_schema = tuple(bq_result_schema)
-        ibis_schema = self.expr._compiled_schema
-        internal_schema = self.expr.schema
-        if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
-            return
-        if internal_schema.to_bigquery() != actual_schema:
-            raise ValueError(
-                f"This error should only occur while testing. BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}"
-            )
-        if ibis_schema.to_bigquery() != actual_schema:
-            raise ValueError(
-                f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}"
-            )
-
    def to_arrow(
        self,
        *,
        ordered: bool = True,
    ) -> Tuple[pa.Table, bigquery.QueryJob]:
        """Run query and download results as a pyarrow Table."""
-        # pa.Table.from_pandas puts index columns last, so update the expression to match.
-        expr = self.expr.select_columns(
-            list(self.value_columns) + list(self.index_columns)
-        )
-
-        _, query_job = self.session._execute(expr, ordered=ordered)
-        results_iterator = query_job.result()
-        pa_table = results_iterator.to_arrow()
+        execute_result = self.session._executor.execute(self.expr, ordered=ordered)
+        pa_table = execute_result.to_arrow_table()

        pa_index_labels = []
        for index_level, index_label in enumerate(self._index_labels):
@@ -498,8 +478,10 @@ def to_arrow(
            else:
                pa_index_labels.append(f"__index_level_{index_level}__")

+        # pa.Table.from_pandas puts index columns last, so update to match.
+        pa_table = pa_table.select([*self.value_columns, *self.index_columns])
        pa_table = pa_table.rename_columns(list(self.column_labels) + pa_index_labels)
-        return pa_table, query_job
+        return pa_table, execute_result.query_job

    def to_pandas(
        self,
@@ -508,7 +490,7 @@ def to_pandas(
        random_state: Optional[int] = None,
        *,
        ordered: bool = True,
-    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
+    ) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]:
        """Run query and download results as a pandas DataFrame.

        Args:
@@ -560,8 +542,8 @@ def try_peek(
        self, n: int = 20, force: bool = False
    ) -> typing.Optional[pd.DataFrame]:
        if force or self.expr.supports_fast_peek:
-            iterator, _ = self.session._peek(self.expr, n)
-            df = self._to_dataframe(iterator)
+            result = self.session._executor.peek(self.expr, n)
+            df = io_pandas.arrow_to_pandas(result.to_arrow_table(), self.expr.schema)
            self._copy_index_to_pandas(df)
            return df
        else:
@@ -574,18 +556,15 @@ def to_pandas_batches(

        page_size and max_results determine the size and number of batches,
        see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result"""
-        dtypes = dict(zip(self.index_columns, self.index.dtypes))
-        dtypes.update(zip(self.value_columns, self.dtypes))
-        _, query_job = self.session._executor.execute(
-            self.expr, ordered=True, use_explicit_destination=True
-        )
-        results_iterator = query_job.result(
-            page_size=page_size, max_results=max_results
-        )
-        for arrow_table in results_iterator.to_arrow_iterable(
-            bqstorage_client=self.session.bqstoragereadclient
-        ):
-            df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)
+        execute_result = self.session._executor.execute(
+            self.expr,
+            ordered=True,
+            use_explicit_destination=True,
+            page_size=page_size,
+            max_results=max_results,
+        )
+        for record_batch in execute_result.arrow_batches():
+            df = io_pandas.arrow_to_pandas(record_batch, self.expr.schema)
            self._copy_index_to_pandas(df)
            yield df

@@ -605,22 +584,19 @@ def _copy_index_to_pandas(self, df: pd.DataFrame):

    def _materialize_local(
        self, materialize_options: MaterializationOptions = MaterializationOptions()
-    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
+    ) -> Tuple[pd.DataFrame, Optional[bigquery.QueryJob]]:
        """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
        # TODO(swast): Allow for dry run and timeout.
-        _, query_job = self.session._execute(
-            self.expr, ordered=materialize_options.ordered
-        )
-        results_iterator = query_job.result()
-
-        table_size = (
-            self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES
+        execute_result = self.session._executor.execute(
+            self.expr, ordered=materialize_options.ordered, get_size_bytes=True
        )
+        assert execute_result.total_bytes is not None
+        table_mb = execute_result.total_bytes / _BYTES_TO_MEGABYTES
        sample_config = materialize_options.downsampling
        max_download_size = sample_config.max_download_size
        fraction = (
-            max_download_size / table_size
-            if (max_download_size is not None) and (table_size != 0)
+            max_download_size / table_mb
+            if (max_download_size is not None) and (table_mb != 0)
            else 2
        )

@@ -629,7 +605,7 @@ def _materialize_local(
        if fraction < 1:
            if not sample_config.enable_downsampling:
                raise RuntimeError(
-                    f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
+                    f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of "
                    f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"
                    "\t\t`bigframes.options.sampling.enable_downsampling = True`\n"
                    "\t* Update the global `max_download_size` option. Please make sure "
@@ -640,12 +616,12 @@ def _materialize_local(
                )

            warnings.warn(
-                f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of"
+                f"The data size ({table_mb:.2f} MB) exceeds the maximum download limit of"
                f"({max_download_size} MB). It will be downsampled to {max_download_size} MB for download."
                "\nPlease refer to the documentation for configuring the downloading limit.",
                UserWarning,
            )
-            total_rows = results_iterator.total_rows
+            total_rows = execute_result.total_rows
            # Remove downsampling config from subsequent invocations, as otherwise could result in many
            # iterations if downsampling undershoots
            return self._downsample(
@@ -657,11 +633,12 @@ def _materialize_local(
                MaterializationOptions(ordered=materialize_options.ordered)
            )
        else:
-            total_rows = results_iterator.total_rows
-            df = self._to_dataframe(results_iterator)
+            total_rows = execute_result.total_rows
+            arrow = self.session._executor.execute(self.expr).to_arrow_table()
+            df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema)
            self._copy_index_to_pandas(df)

-        return df, query_job
+        return df, execute_result.query_job

    def _downsample(
        self, total_rows: int, sampling_method: str, fraction: float, random_state
@@ -680,7 +657,7 @@ def _downsample(
            )
            return block
        elif sampling_method == _UNIFORM:
-            block = self._split(
+            block = self.split(
                fracs=(fraction,),
                random_state=random_state,
                sort=False,
@@ -693,7 +670,7 @@ def _downsample(
                f"please choose from {','.join(_SAMPLING_METHODS)}."
            )

-    def _split(
+    def split(
        self,
        ns: Iterable[int] = (),
        fracs: Iterable[float] = (),
@@ -785,7 +762,7 @@ def _compute_dry_run(
        self, value_keys: Optional[Iterable[str]] = None
    ) -> bigquery.QueryJob:
        expr = self._apply_value_keys_to_expr(value_keys=value_keys)
-        _, query_job = self.session._dry_run(expr)
+        query_job = self.session._executor.dry_run(expr)
        return query_job

    def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None):
@@ -1567,20 +1544,21 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
    @functools.cache
    def retrieve_repr_request_results(
        self, max_results: int
-    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
+    ) -> Tuple[pd.DataFrame, int, Optional[bigquery.QueryJob]]:
        """
        Retrieves a pandas dataframe containing only max_results many rows for use
        with printing methods.

        Returns a tuple of the dataframe and the overall number of rows of the query.
        """

-        results, query_job = self.session._executor.head(self.expr, max_results)
+        head_result = self.session._executor.head(self.expr, max_results)
        count = self.session._executor.get_row_count(self.expr)

-        computed_df = self._to_dataframe(results)
-        self._copy_index_to_pandas(computed_df)
-        return computed_df, count, query_job
+        arrow = self.session._executor.execute(self.expr).to_arrow_table()
+        df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema)
+        self._copy_index_to_pandas(df)
+        return df, count, head_result.query_job

    def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
        expr, result_id = self._expr.promote_offsets()
@@ -2330,7 +2308,10 @@ def to_sql_query(
            # the BigQuery unicode column name feature?
            substitutions[old_id] = new_id

-        sql = self.session._to_sql(
+        # Note: this uses the sql from the executor, so is coupled tightly to execution
+        # implementation. It will reference cached tables instead of original data sources.
+        # Maybe should just compile raw BFET? Depends on user intent.
+        sql = self.session._executor.to_sql(
            array_value, col_id_overrides=substitutions, enable_cache=enable_cache
        )
        return (
@@ -2424,7 +2405,7 @@ def _get_rows_as_json_values(self) -> Block:
        # TODO(shobs): Replace direct SQL manipulation by structured expression
        # manipulation
        expr, ordering_column_name = self.expr.promote_offsets()
-        expr_sql = self.session._to_sql(expr)
+        expr_sql = self.session._executor.to_sql(expr)

        # Names of the columns to serialize for the row.
        # We will use the repr-eval pattern to serialize a value here and
@@ -2578,17 +2559,8 @@ def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index:
            raise bigframes.exceptions.NullIndexError(
                "Cannot materialize index, as this object does not have an index. Set index column(s) using set_index."
            )
-        # Project down to only the index column. So the query can be cached to visualize other data.
-        index_columns = list(self._block.index_columns)
-        expr = self._expr.select_columns(index_columns)
-        results, _ = self.session._execute(
-            expr, ordered=ordered if ordered is not None else True
-        )
-        df = expr.session._rows_to_dataframe(results)
-        df = df.set_index(index_columns)
-        index = df.index
-        index.names = list(self._block._index_labels)  # type:ignore
-        return index
+        ordered = ordered if ordered is not None else True
+        return self._block.select_columns([]).to_pandas(ordered=ordered)[0].index

    def resolve_level(self, level: LevelsType) -> typing.Sequence[str]:
        if utils.is_list_like(level):
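The recurring pattern in this change is that callers stop juggling QueryJob / RowIterator pairs and instead consume a single result object returned by session._executor (execute, peek, head). Only part of that object's surface is visible in the diff (to_arrow_table(), arrow_batches(), query_job, total_rows, total_bytes). The sketch below is a hypothetical stand-in for that shape, not the real bigframes.session.executor class; it uses pyarrow to illustrate the two consumption paths the new code relies on: whole-table download and batched iteration.

# Hypothetical sketch of the result-object shape implied by the diff above.
# to_arrow_table, arrow_batches, query_job, total_rows and total_bytes appear
# in the diff; everything else here is an illustrative assumption.
import dataclasses
from typing import Iterator, Optional

import pyarrow as pa


@dataclasses.dataclass
class FakeExecuteResult:
    _table: pa.Table
    query_job: Optional[object] = None  # a google.cloud.bigquery.QueryJob in real code
    total_bytes: Optional[int] = None

    @property
    def total_rows(self) -> int:
        return self._table.num_rows

    def to_arrow_table(self) -> pa.Table:
        # Whole-result download, as used by to_arrow() and _materialize_local().
        return self._table

    def arrow_batches(self) -> Iterator[pa.RecordBatch]:
        # Streamed record batches, as used by to_pandas_batches().
        return iter(self._table.to_batches(max_chunksize=2))


result = FakeExecuteResult(pa.table({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]}))
print(result.to_arrow_table().num_rows)                         # 4
print(sum(batch.num_rows for batch in result.arrow_batches()))  # 4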