@@ -96,7 +96,31 @@ def _to_index_cols(
     return index_cols


-def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
+def _check_column_duplicates(
+    index_cols: Iterable[str], columns: Iterable[str], index_col_in_columns: bool
+) -> Iterable[str]:
+    """Validates and processes index and data columns for duplicates and overlap.
+
+    This function performs two main tasks:
+    1. Ensures there are no duplicate column names within the `index_cols` list
+       or within the `columns` list.
+    2. Based on the `index_col_in_columns` flag, it validates the relationship
+       between `index_cols` and `columns`.
+
+    Args:
+        index_cols (Iterable[str]):
+            An iterable of column names designated as the index.
+        columns (Iterable[str]):
+            An iterable of column names designated as the data columns.
+        index_col_in_columns (bool):
+            A flag indicating how to handle overlap between `index_cols` and
+            `columns`.
+            - If `False`, the two lists must be disjoint (contain no common
+              elements). An error is raised if any overlap is found.
+            - If `True`, `index_cols` is expected to be a subset of
+              `columns`. An error is raised if an index column is not found
+              in the `columns` list.
+    """
     index_cols_list = list(index_cols) if index_cols is not None else []
     columns_list = list(columns) if columns is not None else []
     set_index = set(index_cols_list)
@@ -108,17 +132,29 @@ def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
             "All column names specified in 'index_col' must be unique."
         )

+    if len(columns_list) == 0:
+        return columns
+
     if len(columns_list) > len(set_columns):
         raise ValueError(
             "The 'columns' argument contains duplicate names. "
             "All column names specified in 'columns' must be unique."
         )

-    if not set_index.isdisjoint(set_columns):
-        raise ValueError(
-            "Found column names that exist in both 'index_col' and 'columns' arguments. "
-            "These arguments must specify distinct sets of columns."
-        )
+    if index_col_in_columns:
+        if not set_index.issubset(set_columns):
+            raise ValueError(
+                f"The specified index column(s) were not found: {set_index - set_columns}. "
+                f"Available columns are: {set_columns}"
+            )
+        return [col for col in columns if col not in set_index]
+    else:
+        if not set_index.isdisjoint(set_columns):
+            raise ValueError(
+                "Found column names that exist in both 'index_col' and 'columns' arguments. "
+                "These arguments must specify distinct sets of columns."
+            )
+        return columns


 @dataclasses.dataclass
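
A minimal sketch of the two validation modes added above. It assumes `_check_column_duplicates` is in scope (the module it lives in is not named in this hunk), and the column names are illustrative:

# Sketch of both modes of _check_column_duplicates; names are illustrative.
index_cols = ["id"]
columns = ["id", "name", "value"]

# index_col_in_columns=True: the index column must appear in `columns`
# and is removed from the returned value columns.
value_cols = _check_column_duplicates(index_cols, columns, index_col_in_columns=True)
assert list(value_cols) == ["name", "value"]

# index_col_in_columns=False: any overlap between the two lists raises.
try:
    _check_column_duplicates(index_cols, columns, index_col_in_columns=False)
except ValueError:
    pass  # overlap between 'index_col' and 'columns' is rejected
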
@@ -391,6 +427,7 @@ def read_gbq_table(  # type: ignore[overload-overlap]
         dry_run: Literal[False] = ...,
         force_total_order: Optional[bool] = ...,
         n_rows: Optional[int] = None,
+        index_col_in_columns: bool = False,
     ) -> dataframe.DataFrame:
         ...

@@ -413,6 +450,7 @@ def read_gbq_table(
         dry_run: Literal[True] = ...,
         force_total_order: Optional[bool] = ...,
         n_rows: Optional[int] = None,
+        index_col_in_columns: bool = False,
     ) -> pandas.Series:
         ...

@@ -434,7 +472,67 @@ def read_gbq_table(
         dry_run: bool = False,
         force_total_order: Optional[bool] = None,
         n_rows: Optional[int] = None,
+        index_col_in_columns: bool = False,
     ) -> dataframe.DataFrame | pandas.Series:
+        """Read a BigQuery table into a BigQuery DataFrames DataFrame.
+
+        This method allows you to create a DataFrame from a BigQuery table.
+        You can specify the columns to load, an index column, and apply
+        filters.
+
+        Args:
+            table_id (str):
+                The identifier of the BigQuery table to read.
+            index_col (Iterable[str] | str | Iterable[int] | int | bigframes.enums.DefaultIndexKind, optional):
+                The column(s) to use as the index for the DataFrame. This can be
+                a single column name or a list of column names. If not provided,
+                a default index will be used based on the session's
+                ``default_index_type``.
+            columns (Iterable[str], optional):
+                The columns to read from the table. If not specified, all
+                columns will be read.
+            names (Optional[Iterable[str]], optional):
+                A list of column names to use for the resulting DataFrame. This
+                is useful if you want to rename the columns as you read the
+                data.
+            max_results (Optional[int], optional):
+                The maximum number of rows to retrieve from the table. If not
+                specified, all rows will be loaded.
+            use_cache (bool, optional):
+                Whether to use cached results for the query. Defaults to True.
+                Setting this to False will force a re-execution of the query.
+            filters (third_party_pandas_gbq.FiltersType, optional):
+                A list of filters to apply to the data. Filters are specified
+                as a list of tuples, where each tuple contains a column name,
+                an operator (e.g., '==', '!='), and a value.
+            enable_snapshot (bool, optional):
+                If True, a snapshot of the table is used to ensure that the
+                DataFrame is deterministic, even if the underlying table
+                changes. Defaults to True.
+            dry_run (bool, optional):
+                If True, the method will not actually execute the query but
+                will instead return statistics about the table. Defaults to False.
+            force_total_order (Optional[bool], optional):
+                If True, a total ordering is enforced on the DataFrame, which
+                can be useful for operations that require a stable row order.
+                If None, the session's default behavior is used.
+            n_rows (Optional[int], optional):
+                The number of rows to consider for type inference and other
+                metadata operations. This does not limit the number of rows
+                in the final DataFrame.
+            index_col_in_columns (bool, optional):
+                Specifies whether the ``index_col`` is also present in the ``columns``
+                list. Defaults to ``False``.
+
+                * If ``False``, ``index_col`` and ``columns`` must specify
+                  distinct sets of columns. An error will be raised if any
+                  column is found in both.
+                * If ``True``, the column(s) in ``index_col`` are expected to
+                  also be present in the ``columns`` list. This is useful
+                  when the index is selected from the data columns (e.g., in a
+                  ``read_csv`` scenario). The column will be used as the
+                  DataFrame's index and removed from the list of value columns.
+        """
         import bigframes._tools.strings
         import bigframes.dataframe as dataframe

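
A hypothetical usage sketch for the new parameter. Here `loader` stands for whatever object exposes this `read_gbq_table` method (its construction is outside this diff), and the table and column names are made up:

# Hypothetical sketch: `loader`, the table id, and column names are assumed.
df = loader.read_gbq_table(
    "my-project.my_dataset.my_table",
    index_col=["id"],
    columns=["id", "name", "value"],  # index column listed among the data columns
    index_col_in_columns=True,        # "id" becomes the index, not a value column
)
# With index_col_in_columns=False (the default), the same call would raise,
# because "id" appears in both index_col and columns.
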
@@ -516,7 +614,9 @@ def read_gbq_table(
             index_col=index_col,
             names=names,
         )
-        _check_column_duplicates(index_cols, columns)
+        columns = list(
+            _check_column_duplicates(index_cols, columns, index_col_in_columns)
+        )

         for key in index_cols:
             if key not in table_column_names:
@@ -798,7 +898,9 @@ def read_gbq_query(
         )

         index_cols = _to_index_cols(index_col)
-        _check_column_duplicates(index_cols, columns)
+        columns = _check_column_duplicates(
+            index_cols, columns, index_col_in_columns=False
+        )

         filters_copy1, filters_copy2 = itertools.tee(filters)
         has_filters = len(list(filters_copy1)) != 0
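
For `read_gbq_query`, the helper is called with `index_col_in_columns=False`, so the existing behavior is unchanged: `index_col` and `columns` must stay disjoint. A hedged sketch, with `loader` assumed as above and an illustrative query:

# Unchanged behavior for queries: overlapping names still raise ValueError.
loader.read_gbq_query(
    "SELECT id, name, value FROM my_dataset.my_table",
    index_col=["id"],
    columns=["id", "name"],  # overlaps with index_col -> ValueError
)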