refactor: refactor block materialization (#306)

TrevorBergeron · web-flow · commit 4a1a1e08014d · 2024-01-12T00:04:15.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py
@@ -14,6 +14,8 @@
 
 """Options for downsampling."""
 
+from __future__ import annotations
+
 import dataclasses
 from typing import Literal, Optional
 
@@ -25,6 +27,28 @@ class SamplingOptions:
     __doc__ = vendored_pandas_config.sampling_options_doc
 
     max_download_size: Optional[int] = 500
+    # Enable downsampling
     enable_downsampling: bool = False
     sampling_method: Literal["head", "uniform"] = "uniform"
     random_state: Optional[int] = None
+
+    def with_max_download_size(self, max_rows: Optional[int]) -> SamplingOptions:
+        return SamplingOptions(
+            max_rows, self.enable_downsampling, self.sampling_method, self.random_state
+        )
+
+    def with_method(self, method: Literal["head", "uniform"]) -> SamplingOptions:
+        return SamplingOptions(self.max_download_size, True, method, self.random_state)
+
+    def with_random_state(self, state: Optional[int]) -> SamplingOptions:
+        return SamplingOptions(
+            self.max_download_size,
+            self.enable_downsampling,
+            self.sampling_method,
+            state,
+        )
+
+    def with_disabled(self) -> SamplingOptions:
+        return SamplingOptions(
+            self.max_download_size, False, self.sampling_method, self.random_state
+        )
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
@@ -21,8 +21,7 @@
 import ibis.expr.types as ibis_types
 import pandas
 
-import bigframes.core.compile.compiled as compiled
-import bigframes.core.compile.compiler as compiler
+import bigframes.core.compile as compiling
 import bigframes.core.expression as expressions
 import bigframes.core.guid
 import bigframes.core.nodes as nodes
@@ -104,11 +103,11 @@ def _try_evaluate_local(self):
     def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
         return self._compile_ordered().get_column_type(key)
 
-    def _compile_ordered(self) -> compiled.OrderedIR:
-        return compiler.compile_ordered(self.node)
+    def _compile_ordered(self) -> compiling.OrderedIR:
+        return compiling.compile_ordered(self.node)
 
-    def _compile_unordered(self) -> compiled.UnorderedIR:
-        return compiler.compile_unordered(self.node)
+    def _compile_unordered(self) -> compiling.UnorderedIR:
+        return compiling.compile_unordered(self.node)
 
     def row_count(self) -> ArrayValue:
         """Get number of rows in ArrayValue as a single-entry ArrayValue."""
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -21,6 +21,7 @@
 
 from __future__ import annotations
 
+import dataclasses
 import functools
 import itertools
 import random
@@ -31,6 +32,7 @@
 import google.cloud.bigquery as bigquery
 import pandas as pd
 
+import bigframes._config.sampling_options as sampling_options
 import bigframes.constants as constants
 import bigframes.core as core
 import bigframes.core.guid as guid
@@ -80,6 +82,14 @@ def _get_block(self) -> Block:
         """Get the underlying block value of the object"""
 
 
+@dataclasses.dataclass()
+class MaterializationOptions:
+    downsampling: sampling_options.SamplingOptions = dataclasses.field(
+        default_factory=sampling_options.SamplingOptions
+    )
+    ordered: bool = True
+
+
 class Block:
     """A immutable 2D data structure."""
 
@@ -395,23 +405,31 @@ def _to_dataframe(self, result) -> pd.DataFrame:
 
     def to_pandas(
         self,
-        value_keys: Optional[Iterable[str]] = None,
-        max_results: Optional[int] = None,
         max_download_size: Optional[int] = None,
         sampling_method: Optional[str] = None,
         random_state: Optional[int] = None,
         *,
         ordered: bool = True,
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame."""
+        if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS):
+            raise NotImplementedError(
+                f"The downsampling method {sampling_method} is not implemented, "
+                f"please choose from {','.join(_SAMPLING_METHODS)}."
+            )
 
-        df, _, query_job = self._compute_and_count(
-            value_keys=value_keys,
-            max_results=max_results,
-            max_download_size=max_download_size,
-            sampling_method=sampling_method,
-            random_state=random_state,
-            ordered=ordered,
+        sampling = bigframes.options.sampling.with_max_download_size(max_download_size)
+        if sampling_method is not None:
+            sampling = sampling.with_method(sampling_method).with_random_state(  # type: ignore
+                random_state
+            )
+        else:
+            sampling = sampling.with_disabled()
+
+        df, query_job = self._materialize_local(
+            materialize_options=MaterializationOptions(
+                downsampling=sampling, ordered=ordered
+            )
         )
         return df, query_job
 
@@ -439,57 +457,29 @@ def _copy_index_to_pandas(self, df: pd.DataFrame):
             # See: https://github.com/pandas-dev/pandas-stubs/issues/804
             df.index.names = self.index.names  # type: ignore
 
-    def _compute_and_count(
-        self,
-        value_keys: Optional[Iterable[str]] = None,
-        max_results: Optional[int] = None,
-        max_download_size: Optional[int] = None,
-        sampling_method: Optional[str] = None,
-        random_state: Optional[int] = None,
-        *,
-        ordered: bool = True,
-    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
+    def _materialize_local(
+        self, materialize_options: MaterializationOptions = MaterializationOptions()
+    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
-        enable_downsampling = (
-            True
-            if sampling_method is not None
-            else bigframes.options.sampling.enable_downsampling
-        )
-
-        max_download_size = (
-            max_download_size or bigframes.options.sampling.max_download_size
-        )
-
-        random_state = random_state or bigframes.options.sampling.random_state
-
-        if sampling_method is None:
-            sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
-        sampling_method = sampling_method.lower()
-
-        if sampling_method not in _SAMPLING_METHODS:
-            raise NotImplementedError(
-                f"The downsampling method {sampling_method} is not implemented, "
-                f"please choose from {','.join(_SAMPLING_METHODS)}."
-            )
-
-        expr = self._apply_value_keys_to_expr(value_keys=value_keys)
-
         results_iterator, query_job = self.session._execute(
-            expr, max_results=max_results, sorted=ordered
+            self.expr, sorted=materialize_options.ordered
         )
-
         table_size = (
             self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES
         )
+        sample_config = materialize_options.downsampling
+        max_download_size = sample_config.max_download_size
         fraction = (
             max_download_size / table_size
             if (max_download_size is not None) and (table_size != 0)
             else 2
         )
 
+        # TODO: Maybe materialize the result before downsampling, since the
+        # downsampled block is executed again via _materialize_local below.
         if fraction < 1:
-            if not enable_downsampling:
+            if not sample_config.enable_downsampling:
                 raise RuntimeError(
                     f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
                     f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"
@@ -507,42 +497,53 @@ def _compute_and_count(
                 "\nPlease refer to the documentation for configuring the downloading limit.",
                 UserWarning,
             )
-            if sampling_method == _HEAD:
-                total_rows = int(results_iterator.total_rows * fraction)
-                results_iterator.max_results = total_rows
-                df = self._to_dataframe(results_iterator)
-
-                if self.index_columns:
-                    df.set_index(list(self.index_columns), inplace=True)
-                    df.index.names = self.index.names  # type: ignore
-            elif (sampling_method == _UNIFORM) and (random_state is None):
-                filtered_expr = self.expr._uniform_sampling(fraction)
-                block = Block(
-                    filtered_expr,
-                    index_columns=self.index_columns,
-                    column_labels=self.column_labels,
-                    index_labels=self.index.names,
-                )
-                df, total_rows, _ = block._compute_and_count(max_download_size=None)
-            elif sampling_method == _UNIFORM:
-                block = self._split(
-                    fracs=(max_download_size / table_size,),
-                    random_state=random_state,
-                    preserve_order=True,
-                )[0]
-                df, total_rows, _ = block._compute_and_count(max_download_size=None)
-            else:
-                # This part should never be called, just in case.
-                raise NotImplementedError(
-                    f"The downsampling method {sampling_method} is not implemented, "
-                    f"please choose from {','.join(_SAMPLING_METHODS)}."
-                )
+            total_rows = results_iterator.total_rows
+            # Pass default sampling options to the recursive call rather than the caller's config;
+            # otherwise repeated downsampling could loop many times if a pass undershoots.
+            return self._downsample(
+                total_rows=total_rows,
+                sampling_method=sample_config.sampling_method,
+                fraction=fraction,
+                random_state=sample_config.random_state,
+            )._materialize_local(
+                MaterializationOptions(ordered=materialize_options.ordered)
+            )
         else:
             total_rows = results_iterator.total_rows
             df = self._to_dataframe(results_iterator)
             self._copy_index_to_pandas(df)
 
-        return df, total_rows, query_job
+        return df, query_job
+
+    def _downsample(
+        self, total_rows: int, sampling_method: str, fraction: float, random_state
+    ) -> Block:
+        # _HEAD slices to int(total_rows * fraction) rows; _UNIFORM samples rows by fraction.
+        if sampling_method == _HEAD:
+            filtered_block = self.slice(stop=int(total_rows * fraction))
+            return filtered_block
+        elif (sampling_method == _UNIFORM) and (random_state is None):
+            filtered_expr = self.expr._uniform_sampling(fraction)
+            block = Block(
+                filtered_expr,
+                index_columns=self.index_columns,
+                column_labels=self.column_labels,
+                index_labels=self.index.names,
+            )
+            return block
+        elif sampling_method == _UNIFORM:
+            block = self._split(
+                fracs=(fraction,),
+                random_state=random_state,
+                preserve_order=True,
+            )[0]
+            return block
+        else:
+            # Should be unreachable; kept as a defensive fallback for unknown methods.
+            raise NotImplementedError(
+                f"The downsampling method {sampling_method} is not implemented, "
+                f"please choose from {','.join(_SAMPLING_METHODS)}."
+            )
 
     def _split(
         self,
@@ -1209,10 +1210,9 @@ def retrieve_repr_request_results(
         count = self.shape[0]
         if count > max_results:
             head_block = self.slice(0, max_results)
-            computed_df, query_job = head_block.to_pandas(max_results=max_results)
         else:
             head_block = self
-            computed_df, query_job = head_block.to_pandas()
+        computed_df, query_job = head_block.to_pandas()
         formatted_df = computed_df.set_axis(self.column_labels, axis=1)
         # we reset the axis and substitute the bf index name for the default
         formatted_df.index.name = self.index.name
diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py
@@ -24,7 +24,7 @@
 
 import bigframes.constants as constants
 import bigframes.core.compile.compiled as compiled
-import bigframes.core.joins.name_resolution as naming
+import bigframes.core.joins as joining
 import bigframes.core.ordering as orderings
 
 SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"}
@@ -68,7 +68,7 @@ def join_by_row_identity_unordered(
     right_mask = right_relative_predicates if how in ["left", "outer"] else None
 
     # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
-    map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER(
+    map_left_id, map_right_id = joining.JOIN_NAME_REMAPPER(
         left.column_ids, right.column_ids
     )
     joined_columns = [
@@ -125,10 +125,10 @@ def join_by_row_identity_ordered(
     right_mask = right_relative_predicates if how in ["left", "outer"] else None
 
     # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
-    lpublicmapping, rpublicmapping = naming.JOIN_NAME_REMAPPER(
+    lpublicmapping, rpublicmapping = joining.JOIN_NAME_REMAPPER(
         left.column_ids, right.column_ids
     )
-    lhiddenmapping, rhiddenmapping = naming.JoinNameRemapper(namespace="hidden")(
+    lhiddenmapping, rhiddenmapping = joining.JoinNameRemapper(namespace="hidden")(
         left._hidden_column_ids, right._hidden_column_ids
     )
     map_left_id = {**lpublicmapping, **lhiddenmapping}
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -305,14 +305,13 @@ def to_pandas(
                 is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame.
         """
         df, query_job = self._block.to_pandas(
-            (self._value_column,),
             max_download_size=max_download_size,
             sampling_method=sampling_method,
             random_state=random_state,
             ordered=ordered,
         )
         self._set_internal_query_job(query_job)
-        series = df[self._value_column]
+        series = df.squeeze(axis=1)
         series.name = self._name
         return series
 
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
@@ -1497,7 +1497,6 @@ def _execute(
         self,
         array_value: core.ArrayValue,
         job_config: Optional[bigquery.job.QueryJobConfig] = None,
-        max_results: Optional[int] = None,
         *,
         sorted: bool = True,
         dry_run=False,
@@ -1507,7 +1506,6 @@ def _execute(
         return self._start_query(
             sql=sql,
             job_config=job_config,
-            max_results=max_results,
         )
 
     def _to_sql(