Skip to content

Commit f495c84

Browse files
authored
chore: use faster query_and_wait API in _read_gbq_colab (#1777)
* chore: use faster query_and_wait API in _read_gbq_colab * try to fix unit tests * more unit test fixes * more test fixes * fix mypy * fix metrics counter in read_gbq with allow_large_results=False * use managedarrowtable * Update bigframes/session/loader.py * split out a few special case return values for read_gbq_query * support slice node for repr * fix failing system test * move slice into semiexecutor and out of readlocalnode * unit test for local executor * split method instead of using reloads * fix reference to _start_query * use limit rewrite for slice support * do not use numpy for offsets
1 parent e480d29 commit f495c84

28 files changed

+665
-117
lines changed

bigframes/blob/_functions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ def _create_udf(self):
9595
sql,
9696
job_config=bigquery.QueryJobConfig(),
9797
metrics=self._session._metrics,
98+
location=None,
99+
project=None,
100+
timeout=None,
101+
query_with_job=True,
98102
)
99103

100104
return udf_name

bigframes/core/array_value.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import bigframes.core.ordering as orderings
3535
import bigframes.core.schema as schemata
3636
import bigframes.core.tree_properties
37-
import bigframes.core.utils
3837
from bigframes.core.window_spec import WindowSpec
3938
import bigframes.dtypes
4039
import bigframes.exceptions as bfe

bigframes/core/compile/compiler.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,9 @@
2222
import bigframes_vendored.ibis.expr.api as ibis_api
2323
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
2424
import bigframes_vendored.ibis.expr.types as ibis_types
25-
import pyarrow as pa
2625

2726
from bigframes import dtypes, operations
28-
from bigframes.core import expression
27+
from bigframes.core import expression, pyarrow_utils
2928
import bigframes.core.compile.compiled as compiled
3029
import bigframes.core.compile.concat as concat_impl
3130
import bigframes.core.compile.configs as configs
@@ -172,9 +171,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args):
172171
pa_table = pa_table.rename_columns([item.id.sql for item in node.scan_list.items])
173172

174173
if offsets:
175-
pa_table = pa_table.append_column(
176-
offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
177-
)
174+
pa_table = pyarrow_utils.append_offsets(pa_table, offsets)
178175
return compiled.UnorderedIR.from_polars(pa_table, bq_schema)
179176

180177

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@
1818
import typing
1919

2020
from google.cloud import bigquery
21-
import pyarrow as pa
2221
import sqlglot.expressions as sge
2322

24-
from bigframes.core import expression, guid, identifiers, nodes, rewrite
23+
from bigframes.core import expression, guid, identifiers, nodes, pyarrow_utils, rewrite
2524
from bigframes.core.compile import configs
2625
import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
2726
import bigframes.core.compile.sqlglot.sqlglot_ir as ir
@@ -155,9 +154,7 @@ def compile_readlocal(self, node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR:
155154

156155
offsets = node.offsets_col.sql if node.offsets_col else None
157156
if offsets:
158-
pa_table = pa_table.append_column(
159-
offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
160-
)
157+
pa_table = pyarrow_utils.append_offsets(pa_table, offsets)
161158

162159
return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema, uid_gen=self.uid_gen)
163160

bigframes/core/local_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ def _adapt_chunked_array(
295295

296296

297297
def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtype]:
298-
"""Normalize the array to managed storage types. Preverse shapes, only transforms values."""
298+
"""Normalize the array to managed storage types. Preserve shapes, only transforms values."""
299299
if array.offset != 0: # Offset arrays don't have all operations implemented
300300
return _adapt_arrow_array(pa.concat_arrays([array]))
301301

bigframes/core/nodes.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,16 @@ def is_limit(self) -> bool:
154154
and (self.stop > 0)
155155
)
156156

157+
@property
def is_noop(self) -> bool:
    """Returns whether this node doesn't actually change the results.

    A slice is a no-op when it starts at the beginning (start is None or 0),
    steps forward one row at a time, and does not stop before the end of the
    input (stop is None or equals the child's row count).
    """
    # TODO: Handle tail case.
    # `not self.start` is True for both None and 0, so no separate
    # `self.start == 0` check is needed.
    return (
        (not self.start)
        and (self.step == 1)
        and ((self.stop is None) or (self.stop == self.row_count))
    )
166+
157167
@property
158168
def row_count(self) -> typing.Optional[int]:
159169
child_length = self.child.row_count

bigframes/core/pyarrow_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,12 @@ def truncate_pyarrow_iterable(
8585
else:
8686
yield batch
8787
total_yielded += batch.num_rows
88+
89+
90+
def append_offsets(
    pa_table: pa.Table,
    offsets_col: str,
) -> pa.Table:
    """Return *pa_table* with a sequential int64 row-offset column appended.

    The new column is named *offsets_col* and contains 0..num_rows-1.
    """
    row_indices = pa.array(range(pa_table.num_rows), type=pa.int64())
    return pa_table.append_column(offsets_col, row_indices)

bigframes/core/rewrite/scan_reduction.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from typing import Optional
1717

1818
from bigframes.core import nodes
19+
import bigframes.core.rewrite.slices
1920

2021

2122
def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTableNode]:
@@ -28,7 +29,15 @@ def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTab
2829
return None
2930

3031

31-
def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLocalNode]:
32+
def try_reduce_to_local_scan(
33+
node: nodes.BigFrameNode,
34+
) -> Optional[tuple[nodes.ReadLocalNode, Optional[int]]]:
35+
"""Create a ReadLocalNode with optional limit, if possible.
36+
37+
Similar to ReadApiSemiExecutor._try_adapt_plan.
38+
"""
39+
node, limit = bigframes.core.rewrite.slices.pull_out_limit(node)
40+
3241
if not all(
3342
map(
3443
lambda x: isinstance(x, (nodes.ReadLocalNode, nodes.SelectionNode)),
@@ -38,7 +47,7 @@ def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLoc
3847
return None
3948
result = node.bottom_up(merge_scan)
4049
if isinstance(result, nodes.ReadLocalNode):
41-
return result
50+
return result, limit
4251
return None
4352

4453

bigframes/core/rewrite/slices.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ def pull_out_limit(
5757
if (prior_limit is not None) and (prior_limit < limit):
5858
limit = prior_limit
5959
return new_root, limit
60+
if root.is_noop:
61+
new_root, prior_limit = pull_out_limit(root.child)
62+
return new_root, prior_limit
6063
elif (
6164
isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode))
6265
and root.row_preserving

bigframes/core/schema.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from dataclasses import dataclass
1818
import functools
1919
import typing
20-
from typing import Sequence
20+
from typing import Dict, List, Sequence
2121

2222
import google.cloud.bigquery
2323
import pyarrow
@@ -47,14 +47,24 @@ def from_bq_table(
4747
column_type_overrides: typing.Optional[
4848
typing.Dict[str, bigframes.dtypes.Dtype]
4949
] = None,
50+
):
51+
return ArraySchema.from_bq_schema(
52+
table.schema, column_type_overrides=column_type_overrides
53+
)
54+
55+
@classmethod
def from_bq_schema(
    cls,
    schema: List[google.cloud.bigquery.SchemaField],
    column_type_overrides: typing.Optional[
        Dict[str, bigframes.dtypes.Dtype]
    ] = None,
):
    """Build an ArraySchema from a list of BigQuery SchemaFields.

    A dtype given in *column_type_overrides* takes precedence over the
    dtype inferred from the corresponding BigQuery field type.
    """
    overrides = column_type_overrides if column_type_overrides is not None else {}
    inferred_types = bigframes.dtypes.bf_type_from_type_kind(schema)
    items = tuple(
        SchemaItem(name, overrides.get(name, dtype))
        for name, dtype in inferred_types.items()
    )
    return ArraySchema(items)
6070

0 commit comments

Comments
 (0)