diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 65435bd902..2acb4ae961 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -184,6 +184,7 @@ def read_gbq( # type: ignore[overload-overlap] use_cache: Optional[bool] = ..., col_order: Iterable[str] = ..., dry_run: Literal[False] = ..., + allow_large_results: bool = ..., ) -> bigframes.dataframe.DataFrame: ... @@ -200,6 +201,7 @@ def read_gbq( use_cache: Optional[bool] = ..., col_order: Iterable[str] = ..., dry_run: Literal[True] = ..., + allow_large_results: bool = ..., ) -> pandas.Series: ... @@ -215,6 +217,7 @@ def read_gbq( use_cache: Optional[bool] = None, col_order: Iterable[str] = (), dry_run: bool = False, + allow_large_results: bool = True, ) -> bigframes.dataframe.DataFrame | pandas.Series: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( @@ -228,6 +231,7 @@ def read_gbq( use_cache=use_cache, col_order=col_order, dry_run=dry_run, + allow_large_results=allow_large_results, ) @@ -391,6 +395,7 @@ def read_gbq_query( # type: ignore[overload-overlap] col_order: Iterable[str] = ..., filters: vendored_pandas_gbq.FiltersType = ..., dry_run: Literal[False] = ..., + allow_large_results: bool = ..., ) -> bigframes.dataframe.DataFrame: ... @@ -407,6 +412,7 @@ def read_gbq_query( col_order: Iterable[str] = ..., filters: vendored_pandas_gbq.FiltersType = ..., dry_run: Literal[True] = ..., + allow_large_results: bool = ..., ) -> pandas.Series: ... @@ -422,6 +428,7 @@ def read_gbq_query( col_order: Iterable[str] = (), filters: vendored_pandas_gbq.FiltersType = (), dry_run: bool = False, + allow_large_results: bool = True, ) -> bigframes.dataframe.DataFrame | pandas.Series: _set_default_session_location_if_possible(query) return global_session.with_default_session( @@ -435,6 +442,7 @@ def read_gbq_query( col_order=col_order, filters=filters, dry_run=dry_run, + allow_large_results=allow_large_results, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index d27cd48cdd..de97a81344 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -394,6 +394,7 @@ def read_gbq( # type: ignore[overload-overlap] use_cache: Optional[bool] = ..., col_order: Iterable[str] = ..., dry_run: Literal[False] = ..., + allow_large_results: bool = ..., ) -> dataframe.DataFrame: ... @@ -410,6 +411,7 @@ def read_gbq( use_cache: Optional[bool] = ..., col_order: Iterable[str] = ..., dry_run: Literal[True] = ..., + allow_large_results: bool = ..., ) -> pandas.Series: ... @@ -424,8 +426,8 @@ def read_gbq( filters: third_party_pandas_gbq.FiltersType = (), use_cache: Optional[bool] = None, col_order: Iterable[str] = (), - dry_run: bool = False - # Add a verify index argument that fails if the index is not unique. + dry_run: bool = False, + allow_large_results: bool = True, ) -> dataframe.DataFrame | pandas.Series: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. if columns and col_order: @@ -445,6 +447,7 @@ def read_gbq( use_cache=use_cache, filters=filters, dry_run=dry_run, + allow_large_results=allow_large_results, ) else: if configuration is not None: @@ -551,6 +554,7 @@ def read_gbq_query( # type: ignore[overload-overlap] col_order: Iterable[str] = ..., filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[False] = ..., + allow_large_results: bool = ..., ) -> dataframe.DataFrame: ... @@ -567,6 +571,7 @@ def read_gbq_query( col_order: Iterable[str] = ..., filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[True] = ..., + allow_large_results: bool = ..., ) -> pandas.Series: ... @@ -582,6 +587,7 @@ def read_gbq_query( col_order: Iterable[str] = (), filters: third_party_pandas_gbq.FiltersType = (), dry_run: bool = False, + allow_large_results: bool = True, ) -> dataframe.DataFrame | pandas.Series: """Turn a SQL query into a DataFrame. @@ -631,9 +637,48 @@ def read_gbq_query( See also: :meth:`Session.read_gbq`. + Args: + query (str): + A SQL query to execute. + index_col (Iterable[str] or str, optional): + The column(s) to use as the index for the DataFrame. This can be + a single column name or a list of column names. If not provided, + a default index will be used. + columns (Iterable[str], optional): + The columns to read from the query result. If not + specified, all columns will be read. + configuration (dict, optional): + A dictionary of query job configuration options. See the + BigQuery REST API documentation for a list of available options: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query + max_results (int, optional): + The maximum number of rows to retrieve from the query + result. If not specified, all rows will be loaded. + use_cache (bool, optional): + Whether to use cached results for the query. Defaults to ``True``. + Setting this to ``False`` will force a re-execution of the query. + col_order (Iterable[str], optional): + The desired order of columns in the resulting DataFrame. This + parameter is deprecated and will be removed in a future version. + Use ``columns`` instead. + filters (list[tuple], optional): + A list of filters to apply to the data. Filters are specified + as a list of tuples, where each tuple contains a column name, + an operator (e.g., '==', '!='), and a value. + dry_run (bool, optional): + If ``True``, the function will not actually execute the query but + will instead return statistics about the query. Defaults to + ``False``. + allow_large_results (bool, optional): + Whether to allow large query results. If ``True``, the query + results can be larger than the maximum response size. + Defaults to ``True``. + Returns: - bigframes.pandas.DataFrame: - A DataFrame representing results of the query or table. + bigframes.pandas.DataFrame or pandas.Series: + A DataFrame representing the result of the query. If ``dry_run`` + is ``True``, a ``pandas.Series`` containing query statistics is + returned. Raises: ValueError: @@ -657,6 +702,7 @@ def read_gbq_query( use_cache=use_cache, filters=filters, dry_run=dry_run, + allow_large_results=allow_large_results, ) @overload @@ -714,9 +760,40 @@ def read_gbq_table( See also: :meth:`Session.read_gbq`. + Args: + table_id (str): + The identifier of the BigQuery table to read. + index_col (Iterable[str] or str, optional): + The column(s) to use as the index for the DataFrame. This can be + a single column name or a list of column names. If not provided, + a default index will be used. + columns (Iterable[str], optional): + The columns to read from the table. If not specified, all + columns will be read. + max_results (int, optional): + The maximum number of rows to retrieve from the table. If not + specified, all rows will be loaded. + filters (list[tuple], optional): + A list of filters to apply to the data. Filters are specified + as a list of tuples, where each tuple contains a column name, + an operator (e.g., '==', '!='), and a value. + use_cache (bool, optional): + Whether to use cached results for the query. Defaults to ``True``. + Setting this to ``False`` will force a re-execution of the query. + col_order (Iterable[str], optional): + The desired order of columns in the resulting DataFrame. This + parameter is deprecated and will be removed in a future version. + Use ``columns`` instead. + dry_run (bool, optional): + If ``True``, the function will not actually execute the query but + will instead return statistics about the table. Defaults to + ``False``. + Returns: - bigframes.pandas.DataFrame: - A DataFrame representing results of the query or table. + bigframes.pandas.DataFrame or pandas.Series: + A DataFrame representing the contents of the table. If + ``dry_run`` is ``True``, a ``pandas.Series`` containing table + statistics is returned. Raises: ValueError: diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index a04da64af0..f480984c5b 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -33,6 +33,7 @@ import pytest import bigframes +import bigframes.core.nodes as nodes import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model @@ -640,6 +641,37 @@ def test_read_gbq_with_configuration( assert df.shape == (9, 3) +def test_read_gbq_query_w_allow_large_results(session: bigframes.Session): + if not hasattr(session.bqclient, "default_job_creation_mode"): + pytest.skip("Jobless query only available on newer google-cloud-bigquery.") + + query = "SELECT 1" + + # Make sure we don't get a cached table. + configuration = {"query": {"useQueryCache": False}} + + # Very small results should wrap a local node. + df_false = session.read_gbq( + query, + configuration=configuration, + allow_large_results=False, + ) + assert df_false.shape == (1, 1) + roots_false = df_false._get_block().expr.node.roots + assert any(isinstance(node, nodes.ReadLocalNode) for node in roots_false) + assert not any(isinstance(node, nodes.ReadTableNode) for node in roots_false) + + # Large results allowed should wrap a table. + df_true = session.read_gbq( + query, + configuration=configuration, + allow_large_results=True, + ) + assert df_true.shape == (1, 1) + roots_true = df_true._get_block().expr.node.roots + assert any(isinstance(node, nodes.ReadTableNode) for node in roots_true) + + def test_read_gbq_with_custom_global_labels( session: bigframes.Session, scalars_table_id: str ): diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 3dae2b6bbe..c9b9ab9292 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -25,6 +25,7 @@ def read_gbq( filters: FiltersType = (), use_cache: Optional[bool] = None, col_order: Iterable[str] = (), + allow_large_results: bool = True, ): """Loads a DataFrame from BigQuery. @@ -156,6 +157,11 @@ def read_gbq( `configuration` to avoid conflicts. col_order (Iterable[str]): Alias for columns, retained for backwards compatibility. + allow_large_results (bool, optional): + Whether to allow large query results. If ``True``, the query + results can be larger than the maximum response size. This + option is only applicable when ``query_or_table`` is a query. + Defaults to ``True``. Raises: bigframes.exceptions.DefaultIndexWarning: