
Commit 4c5dee5

Authored by sycai, release-please[bot], and gcf-owl-bot[bot]
feat: add dry_run parameter to read_gbq(), read_gbq_table() and read_gbq_query() (#1674)
* feat: add dry_run parameter to read_gbq() and read_gbq_query()
* fix lint
* chore(main): release 2.2.0 (#1643)
* create a different stats report for reading gbq tables
* fix lint
* 🦉 Updates from OwlBot post-processor
  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* rename column count and column dtypes
* fix typo
* format code

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent cb0267d commit 4c5dee5
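
With dry_run=True, these readers return a pandas Series of job statistics instead of a DataFrame. A minimal usage sketch follows; the table name is a placeholder and the exact stat keys are assumptions based on the report assembled in this change, not output copied from the library.

# Minimal usage sketch (placeholder table name; stat keys are assumptions).
import bigframes.pandas as bpd

# Default behaviour: lazily reads the table into a bigframes DataFrame.
df = bpd.read_gbq_table("my-project.my_dataset.my_table")

# Dry run: no data is read; a pandas Series of job/schema statistics is
# returned instead (e.g. bytes that would be processed, column dtypes).
stats = bpd.read_gbq_table("my-project.my_dataset.my_table", dry_run=True)
print(stats["totalBytesProcessed"])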

File tree

6 files changed: +550 -74 lines

bigframes/core/blocks.py

Lines changed: 10 additions & 61 deletions
@@ -22,25 +22,14 @@
 from __future__ import annotations

 import ast
-import copy
 import dataclasses
 import datetime
 import functools
 import itertools
 import random
 import textwrap
 import typing
-from typing import (
-    Any,
-    Iterable,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-)
+from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union
 import warnings

 import bigframes_vendored.constants as constants
@@ -69,6 +58,7 @@
 import bigframes.exceptions as bfe
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
+from bigframes.session import dry_runs
 from bigframes.session import executor as executors

 # Type constraint for wherever column labels are used
@@ -822,59 +812,18 @@ def _compute_dry_run(
         if sampling.enable_downsampling:
             raise NotImplementedError("Dry run with sampling is not supported")

-        index: List[Any] = []
-        values: List[Any] = []
-
-        index.append("columnCount")
-        values.append(len(self.value_columns))
-        index.append("columnDtypes")
-        values.append(
-            {
-                col: self.expr.get_column_type(self.resolve_label_exact_or_error(col))
-                for col in self.column_labels
-            }
-        )
-
-        index.append("indexLevel")
-        values.append(self.index.nlevels)
-        index.append("indexDtypes")
-        values.append(self.index.dtypes)
-
         expr = self._apply_value_keys_to_expr(value_keys=value_keys)
         query_job = self.session._executor.dry_run(expr, ordered)
-        job_api_repr = copy.deepcopy(query_job._properties)
-
-        job_ref = job_api_repr["jobReference"]
-        for key, val in job_ref.items():
-            index.append(key)
-            values.append(val)
-
-        index.append("jobType")
-        values.append(job_api_repr["configuration"]["jobType"])
-
-        query_config = job_api_repr["configuration"]["query"]
-        for key in ("destinationTable", "useLegacySql"):
-            index.append(key)
-            values.append(query_config.get(key))
-
-        query_stats = job_api_repr["statistics"]["query"]
-        for key in (
-            "referencedTables",
-            "totalBytesProcessed",
-            "cacheHit",
-            "statementType",
-        ):
-            index.append(key)
-            values.append(query_stats.get(key))

-        index.append("creationTime")
-        values.append(
-            pd.Timestamp(
-                job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC"
-            )
-        )
+        column_dtypes = {
+            col: self.expr.get_column_type(self.resolve_label_exact_or_error(col))
+            for col in self.column_labels
+        }

-        return pd.Series(values, index=index), query_job
+        dry_run_stats = dry_runs.get_query_stats_with_dtypes(
+            query_job, column_dtypes, self.index.dtypes
+        )
+        return dry_run_stats, query_job

     def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None):
         expr = self._expr
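
For reference, the deleted inline logic above assembled the dry-run report roughly as sketched below. This is a reconstruction of the removed lines as a standalone function, not the body of the new bigframes.session.dry_runs.get_query_stats_with_dtypes helper; per the commit message some keys were renamed, so treat the key names as illustrative.

# Sketch reconstructed from the deleted code above; key names are illustrative.
from typing import Any, Dict, List, Sequence

import pandas as pd


def build_dry_run_stats(
    job_api_repr: Dict[str, Any],   # e.g. a deep copy of query_job._properties
    column_dtypes: Dict[str, Any],  # column label -> dtype
    index_dtypes: Sequence[Any],    # dtype of each index level
) -> pd.Series:
    index: List[Any] = ["columnCount", "columnDtypes", "indexLevel", "indexDtypes"]
    values: List[Any] = [
        len(column_dtypes),
        column_dtypes,
        len(index_dtypes),
        index_dtypes,
    ]

    # Job identity fields: projectId, jobId, location.
    for key, val in job_api_repr["jobReference"].items():
        index.append(key)
        values.append(val)

    index.append("jobType")
    values.append(job_api_repr["configuration"]["jobType"])

    query_config = job_api_repr["configuration"]["query"]
    for key in ("destinationTable", "useLegacySql"):
        index.append(key)
        values.append(query_config.get(key))

    query_stats = job_api_repr["statistics"]["query"]
    for key in ("referencedTables", "totalBytesProcessed", "cacheHit", "statementType"):
        index.append(key)
        values.append(query_stats.get(key))

    index.append("creationTime")
    values.append(
        pd.Timestamp(job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC")
    )

    return pd.Series(values, index=index)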

bigframes/pandas/io/api.py

Lines changed: 104 additions & 3 deletions
@@ -25,6 +25,7 @@
     Literal,
     MutableSequence,
     Optional,
+    overload,
     Sequence,
     Tuple,
     Union,
@@ -155,6 +156,38 @@ def read_json(
 read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json)


+@overload
+def read_gbq(  # type: ignore[overload-overlap]
+    query_or_table: str,
+    *,
+    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ...,
+    columns: Iterable[str] = ...,
+    configuration: Optional[Dict] = ...,
+    max_results: Optional[int] = ...,
+    filters: vendored_pandas_gbq.FiltersType = ...,
+    use_cache: Optional[bool] = ...,
+    col_order: Iterable[str] = ...,
+    dry_run: Literal[False] = ...,
+) -> bigframes.dataframe.DataFrame:
+    ...
+
+
+@overload
+def read_gbq(
+    query_or_table: str,
+    *,
+    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ...,
+    columns: Iterable[str] = ...,
+    configuration: Optional[Dict] = ...,
+    max_results: Optional[int] = ...,
+    filters: vendored_pandas_gbq.FiltersType = ...,
+    use_cache: Optional[bool] = ...,
+    col_order: Iterable[str] = ...,
+    dry_run: Literal[True] = ...,
+) -> pandas.Series:
+    ...
+
+
 def read_gbq(
     query_or_table: str,
     *,
@@ -165,7 +198,8 @@ def read_gbq(
     filters: vendored_pandas_gbq.FiltersType = (),
     use_cache: Optional[bool] = None,
     col_order: Iterable[str] = (),
-) -> bigframes.dataframe.DataFrame:
+    dry_run: bool = False,
+) -> bigframes.dataframe.DataFrame | pandas.Series:
     _set_default_session_location_if_possible(query_or_table)
     return global_session.with_default_session(
         bigframes.session.Session.read_gbq,
@@ -177,6 +211,7 @@ def read_gbq(
         filters=filters,
         use_cache=use_cache,
         col_order=col_order,
+        dry_run=dry_run,
     )


@@ -208,6 +243,38 @@ def read_gbq_object_table(
     )


+@overload
+def read_gbq_query(  # type: ignore[overload-overlap]
+    query: str,
+    *,
+    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ...,
+    columns: Iterable[str] = ...,
+    configuration: Optional[Dict] = ...,
+    max_results: Optional[int] = ...,
+    use_cache: Optional[bool] = ...,
+    col_order: Iterable[str] = ...,
+    filters: vendored_pandas_gbq.FiltersType = ...,
+    dry_run: Literal[False] = ...,
+) -> bigframes.dataframe.DataFrame:
+    ...
+
+
+@overload
+def read_gbq_query(
+    query: str,
+    *,
+    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ...,
+    columns: Iterable[str] = ...,
+    configuration: Optional[Dict] = ...,
+    max_results: Optional[int] = ...,
+    use_cache: Optional[bool] = ...,
+    col_order: Iterable[str] = ...,
+    filters: vendored_pandas_gbq.FiltersType = ...,
+    dry_run: Literal[True] = ...,
+) -> pandas.Series:
+    ...
+
+
 def read_gbq_query(
     query: str,
     *,
@@ -218,7 +285,8 @@ def read_gbq_query(
     use_cache: Optional[bool] = None,
     col_order: Iterable[str] = (),
     filters: vendored_pandas_gbq.FiltersType = (),
-) -> bigframes.dataframe.DataFrame:
+    dry_run: bool = False,
+) -> bigframes.dataframe.DataFrame | pandas.Series:
     _set_default_session_location_if_possible(query)
     return global_session.with_default_session(
         bigframes.session.Session.read_gbq_query,
@@ -230,12 +298,43 @@ def read_gbq_query(
         use_cache=use_cache,
         col_order=col_order,
         filters=filters,
+        dry_run=dry_run,
     )


 read_gbq_query.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_query)


+@overload
+def read_gbq_table(  # type: ignore[overload-overlap]
+    query: str,
+    *,
+    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ...,
+    columns: Iterable[str] = ...,
+    max_results: Optional[int] = ...,
+    filters: vendored_pandas_gbq.FiltersType = ...,
+    use_cache: bool = ...,
+    col_order: Iterable[str] = ...,
+    dry_run: Literal[False] = ...,
+) -> bigframes.dataframe.DataFrame:
+    ...
+
+
+@overload
+def read_gbq_table(
+    query: str,
+    *,
+    index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ...,
+    columns: Iterable[str] = ...,
+    max_results: Optional[int] = ...,
+    filters: vendored_pandas_gbq.FiltersType = ...,
+    use_cache: bool = ...,
+    col_order: Iterable[str] = ...,
+    dry_run: Literal[True] = ...,
+) -> pandas.Series:
+    ...
+
+
 def read_gbq_table(
     query: str,
     *,
@@ -245,7 +344,8 @@ def read_gbq_table(
     filters: vendored_pandas_gbq.FiltersType = (),
     use_cache: bool = True,
     col_order: Iterable[str] = (),
-) -> bigframes.dataframe.DataFrame:
+    dry_run: bool = False,
+) -> bigframes.dataframe.DataFrame | pandas.Series:
     _set_default_session_location_if_possible(query)
     return global_session.with_default_session(
         bigframes.session.Session.read_gbq_table,
@@ -256,6 +356,7 @@ def read_gbq_table(
         filters=filters,
         use_cache=use_cache,
         col_order=col_order,
+        dry_run=dry_run,
     )

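
The Literal-typed overloads above let a static type checker narrow the return type from the value passed for dry_run: DataFrame for the default dry_run=False, pandas.Series for dry_run=True. Below is a minimal, self-contained sketch of that pattern; the names are illustrative stand-ins, not bigframes APIs.

# Sketch of the Literal-overload pattern; FakeDataFrame/read_thing are made up.
from typing import Literal, Union, overload

import pandas


class FakeDataFrame:  # stand-in for bigframes.dataframe.DataFrame
    pass


# The ignore mirrors the diff above: a call that omits dry_run matches both
# overloads with different return types, which mypy reports as overload-overlap.
@overload
def read_thing(  # type: ignore[overload-overlap]
    name: str, *, dry_run: Literal[False] = ...
) -> FakeDataFrame:
    ...


@overload
def read_thing(name: str, *, dry_run: Literal[True] = ...) -> pandas.Series:
    ...


def read_thing(name: str, *, dry_run: bool = False) -> Union[FakeDataFrame, pandas.Series]:
    if dry_run:
        # A dry run returns statistics instead of data.
        return pandas.Series({"totalBytesProcessed": 0})
    return FakeDataFrame()


df = read_thing("tbl")                   # a type checker infers FakeDataFrame
stats = read_thing("tbl", dry_run=True)  # a type checker infers pandas.Series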
