feat: add allow_large_results option to read_gbq_query. Set to False to enable faster queries #1935

Status: Open. Wants to merge 4 commits into main.
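
A minimal usage sketch of the new option, assuming it ships with the signature shown in the diffs below (the query is illustrative):

import bigframes.pandas as bpd

# allow_large_results=False opts into the faster small-results path: the
# result must fit within BigQuery's maximum response size, and (per the
# system test below) it is embedded locally rather than read back from a
# destination table.
df_fast = bpd.read_gbq_query("SELECT 1 AS x", allow_large_results=False)

# The default, True, keeps the existing behavior and supports results of
# any size.
df_big = bpd.read_gbq_query("SELECT 1 AS x", allow_large_results=True)
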
8 changes: 8 additions & 0 deletions bigframes/pandas/io/api.py
@@ -184,6 +184,7 @@ def read_gbq( # type: ignore[overload-overlap]
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> bigframes.dataframe.DataFrame:
...

@@ -200,6 +201,7 @@ def read_gbq(
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

@@ -215,6 +217,7 @@ def read_gbq(
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
dry_run: bool = False,
allow_large_results: bool = True,
A contributor commented on the allow_large_results default here:
Should allow_large_results also default to None? This would allow it to inherit its value from ComputeOptions.allow_large_results.
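
A sketch of what that suggestion could look like, assuming ComputeOptions grows an allow_large_results attribute reachable via the global options object (the helper name is hypothetical):

from typing import Optional

import bigframes

def _resolve_allow_large_results(explicit: Optional[bool]) -> bool:
    # Hypothetical helper: an explicit argument wins; None falls back to
    # the session-wide ComputeOptions.allow_large_results setting.
    if explicit is not None:
        return explicit
    inherited = bigframes.options.compute.allow_large_results
    # Treat an unset option as the current default, True.
    return True if inherited is None else inherited
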

) -> bigframes.dataframe.DataFrame | pandas.Series:
_set_default_session_location_if_possible(query_or_table)
return global_session.with_default_session(
@@ -228,6 +231,7 @@ def read_gbq(
use_cache=use_cache,
col_order=col_order,
dry_run=dry_run,
allow_large_results=allow_large_results,
)


@@ -391,6 +395,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
col_order: Iterable[str] = ...,
filters: vendored_pandas_gbq.FiltersType = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> bigframes.dataframe.DataFrame:
...

@@ -407,6 +412,7 @@ def read_gbq_query(
col_order: Iterable[str] = ...,
filters: vendored_pandas_gbq.FiltersType = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

@@ -422,6 +428,7 @@ def read_gbq_query(
col_order: Iterable[str] = (),
filters: vendored_pandas_gbq.FiltersType = (),
dry_run: bool = False,
allow_large_results: bool = True,
) -> bigframes.dataframe.DataFrame | pandas.Series:
_set_default_session_location_if_possible(query)
return global_session.with_default_session(
@@ -435,6 +442,7 @@ def read_gbq_query(
col_order=col_order,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)


89 changes: 83 additions & 6 deletions bigframes/session/__init__.py
@@ -394,6 +394,7 @@ def read_gbq( # type: ignore[overload-overlap]
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> dataframe.DataFrame:
...

@@ -410,6 +411,7 @@ def read_gbq(
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

@@ -424,8 +426,8 @@ def read_gbq(
filters: third_party_pandas_gbq.FiltersType = (),
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
- dry_run: bool = False
- # Add a verify index argument that fails if the index is not unique.
+ dry_run: bool = False,
+ allow_large_results: bool = True,
) -> dataframe.DataFrame | pandas.Series:
# TODO(b/281571214): Generate prompt to show the progress of read_gbq.
if columns and col_order:
@@ -445,6 +447,7 @@ def read_gbq(
use_cache=use_cache,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)
else:
if configuration is not None:
@@ -551,6 +554,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
col_order: Iterable[str] = ...,
filters: third_party_pandas_gbq.FiltersType = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> dataframe.DataFrame:
...

@@ -567,6 +571,7 @@ def read_gbq_query(
col_order: Iterable[str] = ...,
filters: third_party_pandas_gbq.FiltersType = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

@@ -582,6 +587,7 @@ def read_gbq_query(
col_order: Iterable[str] = (),
filters: third_party_pandas_gbq.FiltersType = (),
dry_run: bool = False,
allow_large_results: bool = True,
) -> dataframe.DataFrame | pandas.Series:
"""Turn a SQL query into a DataFrame.

@@ -631,9 +637,48 @@ def read_gbq_query(

See also: :meth:`Session.read_gbq`.

Args:
query (str):
A SQL query to execute.
index_col (Iterable[str] or str, optional):
The column(s) to use as the index for the DataFrame. This can be
a single column name or a list of column names. If not provided,
a default index will be used.
columns (Iterable[str], optional):
The columns to read from the query result. If not
specified, all columns will be read.
configuration (dict, optional):
A dictionary of query job configuration options. See the
BigQuery REST API documentation for a list of available options:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query
max_results (int, optional):
The maximum number of rows to retrieve from the query
result. If not specified, all rows will be loaded.
use_cache (bool, optional):
Whether to use cached results for the query. Defaults to ``True``.
Setting this to ``False`` will force a re-execution of the query.
col_order (Iterable[str], optional):
The desired order of columns in the resulting DataFrame. This
parameter is deprecated and will be removed in a future version.
Use ``columns`` instead.
filters (list[tuple], optional):
A list of filters to apply to the data. Filters are specified
as a list of tuples, where each tuple contains a column name,
an operator (e.g., '==', '!='), and a value.
dry_run (bool, optional):
If ``True``, the function will not actually execute the query but
will instead return statistics about the query. Defaults to
``False``.
allow_large_results (bool, optional):
Whether to allow large query results. If ``True``, the query
results can be larger than the maximum response size.
Defaults to ``True``.

Returns:
- bigframes.pandas.DataFrame:
- A DataFrame representing results of the query or table.
+ bigframes.pandas.DataFrame or pandas.Series:
+ A DataFrame representing the result of the query. If ``dry_run``
+ is ``True``, a ``pandas.Series`` containing query statistics is
+ returned.

Raises:
ValueError:
@@ -657,6 +702,7 @@ def read_gbq_query(
use_cache=use_cache,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)
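
As the updated docstring and return annotation describe, dry_run=True returns query statistics as a pandas.Series instead of executing anything; a small sketch (the public dataset and the exact statistic labels are illustrative):

import bigframes.pandas as bpd

stats = bpd.read_gbq_query(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10",
    dry_run=True,
)
# No query job is executed; the Series carries statistics such as the
# estimated bytes the query would process.
print(stats)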

@overload
@@ -714,9 +760,40 @@ def read_gbq_table(

See also: :meth:`Session.read_gbq`.

Args:
table_id (str):
The identifier of the BigQuery table to read.
index_col (Iterable[str] or str, optional):
The column(s) to use as the index for the DataFrame. This can be
a single column name or a list of column names. If not provided,
a default index will be used.
columns (Iterable[str], optional):
The columns to read from the table. If not specified, all
columns will be read.
max_results (int, optional):
The maximum number of rows to retrieve from the table. If not
specified, all rows will be loaded.
filters (list[tuple], optional):
A list of filters to apply to the data. Filters are specified
as a list of tuples, where each tuple contains a column name,
an operator (e.g., '==', '!='), and a value.
use_cache (bool, optional):
Whether to use cached results for the query. Defaults to ``True``.
Setting this to ``False`` will force a re-execution of the query.
col_order (Iterable[str], optional):
The desired order of columns in the resulting DataFrame. This
parameter is deprecated and will be removed in a future version.
Use ``columns`` instead.
dry_run (bool, optional):
If ``True``, the function will not actually execute the query but
will instead return statistics about the table. Defaults to
``False``.

Returns:
- bigframes.pandas.DataFrame:
- A DataFrame representing results of the query or table.
+ bigframes.pandas.DataFrame or pandas.Series:
+ A DataFrame representing the contents of the table. If
+ ``dry_run`` is ``True``, a ``pandas.Series`` containing table
+ statistics is returned.

Raises:
ValueError:
32 changes: 32 additions & 0 deletions tests/system/small/test_session.py
@@ -33,6 +33,7 @@
import pytest

import bigframes
import bigframes.core.nodes as nodes
import bigframes.dataframe
import bigframes.dtypes
import bigframes.ml.linear_model
@@ -640,6 +641,37 @@ def test_read_gbq_with_configuration(
assert df.shape == (9, 3)


def test_read_gbq_query_w_allow_large_results(session: bigframes.Session):
if not hasattr(session.bqclient, "default_job_creation_mode"):
pytest.skip("Jobless query only available on newer google-cloud-bigquery.")

query = "SELECT 1"

# Make sure we don't get a cached table.
configuration = {"query": {"useQueryCache": False}}

# Very small results should wrap a local node.
df_false = session.read_gbq(
query,
configuration=configuration,
allow_large_results=False,
)
assert df_false.shape == (1, 1)
roots_false = df_false._get_block().expr.node.roots
assert any(isinstance(node, nodes.ReadLocalNode) for node in roots_false)
assert not any(isinstance(node, nodes.ReadTableNode) for node in roots_false)

# Large results allowed should wrap a table.
df_true = session.read_gbq(
query,
configuration=configuration,
allow_large_results=True,
)
assert df_true.shape == (1, 1)
roots_true = df_true._get_block().expr.node.roots
assert any(isinstance(node, nodes.ReadTableNode) for node in roots_true)


def test_read_gbq_with_custom_global_labels(
session: bigframes.Session, scalars_table_id: str
):
6 changes: 6 additions & 0 deletions third_party/bigframes_vendored/pandas/io/gbq.py
@@ -25,6 +25,7 @@ def read_gbq(
filters: FiltersType = (),
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
allow_large_results: bool = True,
):
"""Loads a DataFrame from BigQuery.

@@ -156,6 +157,11 @@
`configuration` to avoid conflicts.
col_order (Iterable[str]):
Alias for columns, retained for backwards compatibility.
allow_large_results (bool, optional):
Whether to allow large query results. If ``True``, the query
results can be larger than the maximum response size. This
option is only applicable when ``query_or_table`` is a query.
Defaults to ``True``.

Raises:
bigframes.exceptions.DefaultIndexWarning:
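
Since the docstring above notes that allow_large_results only applies when query_or_table is a query, a quick illustration (the table path is a public-dataset example):

import bigframes.pandas as bpd

# Query text takes the query path, so allow_large_results applies.
df_query = bpd.read_gbq("SELECT 1 AS x", allow_large_results=False)

# A plain table reference takes the table-read path; the option is
# accepted but has no effect.
df_table = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")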