Skip to content

Commit ae03756

Browse files
authored
fix: make to_pandas override enable_downsampling when sampling_method is manually set. (#200)
* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
1 parent edd0522 commit ae03756

File tree

2 files changed

+34
-18
lines changed

2 files changed

+34
-18
lines changed

bigframes/core/blocks.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -389,23 +389,6 @@ def to_pandas(
389389
ordered: bool = True,
390390
) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
391391
"""Run query and download results as a pandas DataFrame."""
392-
if max_download_size is None:
393-
max_download_size = bigframes.options.sampling.max_download_size
394-
if sampling_method is None:
395-
sampling_method = (
396-
bigframes.options.sampling.sampling_method
397-
if bigframes.options.sampling.sampling_method is not None
398-
else _UNIFORM
399-
)
400-
if random_state is None:
401-
random_state = bigframes.options.sampling.random_state
402-
403-
sampling_method = sampling_method.lower()
404-
if sampling_method not in _SAMPLING_METHODS:
405-
raise NotImplementedError(
406-
f"The downsampling method {sampling_method} is not implemented, "
407-
f"please choose from {','.join(_SAMPLING_METHODS)}."
408-
)
409392

410393
df, _, query_job = self._compute_and_count(
411394
value_keys=value_keys,
@@ -453,6 +436,28 @@ def _compute_and_count(
453436
) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
454437
"""Run query and download results as a pandas DataFrame. Return the total number of results as well."""
455438
# TODO(swast): Allow for dry run and timeout.
439+
enable_downsampling = (
440+
True
441+
if sampling_method is not None
442+
else bigframes.options.sampling.enable_downsampling
443+
)
444+
445+
max_download_size = (
446+
max_download_size or bigframes.options.sampling.max_download_size
447+
)
448+
449+
random_state = random_state or bigframes.options.sampling.random_state
450+
451+
if sampling_method is None:
452+
sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
453+
sampling_method = sampling_method.lower()
454+
455+
if sampling_method not in _SAMPLING_METHODS:
456+
raise NotImplementedError(
457+
f"The downsampling method {sampling_method} is not implemented, "
458+
f"please choose from {','.join(_SAMPLING_METHODS)}."
459+
)
460+
456461
expr = self._apply_value_keys_to_expr(value_keys=value_keys)
457462

458463
results_iterator, query_job = expr.start_query(
@@ -469,7 +474,7 @@ def _compute_and_count(
469474
)
470475

471476
if fraction < 1:
472-
if not bigframes.options.sampling.enable_downsampling:
477+
if not enable_downsampling:
473478
raise RuntimeError(
474479
f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
475480
f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"

tests/system/small/test_dataframe.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3546,3 +3546,14 @@ def test_df_dot_operator_series(
35463546
bf_result,
35473547
pd_result,
35483548
)
3549+
3550+
3551+
def test_to_pandas_downsampling_option_override(session):
    """Check that ``to_pandas`` with an explicit ``sampling_method`` honors the size cap.

    Reads the ``batting`` system-test table, downloads it with
    ``max_download_size=1`` and ``sampling_method="head"``, and asserts that
    the resulting pandas DataFrame occupies roughly 1 MB (within 30%).
    """
    max_size_mb = 1
    bf_df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")

    pd_df = bf_df.to_pandas(max_download_size=max_size_mb, sampling_method="head")

    # memory_usage(deep=True) reports bytes per column; convert the total to MB
    # so it is comparable with the max_download_size argument.
    bytes_per_mb = 1024 * 1024
    actual_size_mb = pd_df.memory_usage(deep=True).sum() / bytes_per_mb
    assert actual_size_mb == pytest.approx(max_size_mb, rel=0.3)

0 commit comments

Comments
 (0)