Commit 7e8296d

refactor: caching and aggregation don't require ordering (#759)
1 parent 35fc7e4 commit 7e8296d

8 files changed (+102, -16 lines)

bigframes/core/__init__.py

Lines changed: 12 additions & 3 deletions
@@ -177,7 +177,7 @@ def _compiled_schema(self) -> schemata.ArraySchema:
     def as_cached(
         self: ArrayValue,
         cache_table: google.cloud.bigquery.Table,
-        ordering: orderings.ExpressionOrdering,
+        ordering: Optional[orderings.ExpressionOrdering],
     ) -> ArrayValue:
         """
         Replace the node with an equivalent one that references a table where the value has been materialized to.
@@ -234,6 +234,8 @@ def promote_offsets(self, col_id: str) -> ArrayValue:
         """
         Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
         """
+        if not self.session._strictly_ordered:
+            raise ValueError("Generating offsets not supported in unordered mode")
         return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id))

     def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
@@ -382,6 +384,10 @@ def project_window_op(
         never_skip_nulls: will disable null skipping for operators that would otherwise do so
         skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection
         """
+        if not self.session._strictly_ordered:
+            # TODO: Support unbounded windows with aggregate ops and some row-order-independent analytic ops
+            # TODO: Support non-deterministic windowing
+            raise ValueError("Windowed ops not supported in unordered mode")
         return ArrayValue(
             nodes.WindowOpNode(
                 child=self.node,
@@ -433,8 +439,9 @@ def unpivot(
         """
         # There will be N labels, used to disambiguate which of N source columns produced each output row
         explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_")
-        labels_array = self._create_unpivot_labels_array(row_labels, index_col_ids)
-        labels_array = labels_array.promote_offsets(explode_offsets_id)
+        labels_array = self._create_unpivot_labels_array(
+            row_labels, index_col_ids, explode_offsets_id
+        )

         # Unpivot creates N output rows for each input row, labels disambiguate these N rows
         joined_array = self._cross_join_w_labels(labels_array, join_side)
@@ -500,6 +507,7 @@ def _create_unpivot_labels_array(
         self,
         former_column_labels: typing.Sequence[typing.Hashable],
         col_ids: typing.Sequence[str],
+        offsets_id: str,
     ) -> ArrayValue:
         """Create an ArrayValue from a list of label tuples."""
         rows = []
@@ -510,6 +518,7 @@ def _create_unpivot_labels_array(
                 col_ids[i]: (row_label[i] if pandas.notnull(row_label[i]) else None)
                 for i in range(len(col_ids))
             }
+            row[offsets_id] = row_offset
             rows.append(row)

         return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session)
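The interesting change in unpivot is that label disambiguation no longer goes through promote_offsets, an order-dependent window operation that now raises in unordered mode. Instead, the offset is written into each label row while the small labels table is built locally. A minimal standalone sketch of that construction, with hypothetical stand-in inputs rather than the real ArrayValue internals:

import pandas
import pyarrow as pa

# Hypothetical stand-ins for the method's arguments.
row_labels = [("a",), ("b",), ("c",)]
col_ids = ["label_col"]
offsets_id = "unpivot_offsets_0"

rows = []
for row_offset, row_label in enumerate(row_labels):
    row = {
        col_ids[i]: (row_label[i] if pandas.notnull(row_label[i]) else None)
        for i in range(len(col_ids))
    }
    # The disambiguating offset is assigned locally during construction,
    # so no window function (and hence no total ordering) is needed.
    row[offsets_id] = row_offset
    rows.append(row)

labels_table = pa.Table.from_pylist(rows)
print(labels_table.column_names)  # ['label_col', 'unpivot_offsets_0']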

bigframes/core/blocks.py

Lines changed: 26 additions & 8 deletions
@@ -553,7 +553,7 @@ def _materialize_local(
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
         _, query_job = self.session._query_to_destination(
-            self.session._to_sql(self.expr, sorted=True),
+            self.session._to_sql(self.expr, sorted=materialize_options.ordered),
             list(self.index_columns),
             api_name="cached",
             do_clustering=False,
@@ -1016,7 +1016,7 @@ def aggregate_all_and_stack(
                 index_columns=[index_id],
                 column_labels=self.column_labels,
                 index_labels=[None],
-            ).transpose(original_row_index=pd.Index([None]))
+            ).transpose(original_row_index=pd.Index([None]), single_row_mode=True)
         else:  # axis_n == 1
             # using offsets as identity to group on.
             # TODO: Allow to promote identity/total_order columns instead for better perf
@@ -1659,6 +1659,8 @@ def melt(
         value_vars=typing.Sequence[str],
         var_names=typing.Sequence[typing.Hashable],
         value_name: typing.Hashable = "value",
+        *,
+        create_offsets_index: bool = True,
     ):
         """
         Unpivot columns to produce longer, narrower dataframe.
@@ -1679,20 +1681,31 @@ def melt(
             index_col_ids=var_col_ids,
             join_side="right",
         )
-        index_id = guid.generate_guid()
-        unpivot_expr = unpivot_expr.promote_offsets(index_id)
+
+        if create_offsets_index:
+            index_id = guid.generate_guid()
+            unpivot_expr = unpivot_expr.promote_offsets(index_id)
+            index_cols = [index_id]
+        else:
+            index_cols = []
+
         # Need to reorder to get id_vars before var_col and unpivot_col
         unpivot_expr = unpivot_expr.select_columns(
-            [index_id, *id_vars, *var_col_ids, unpivot_col_id]
+            [*index_cols, *id_vars, *var_col_ids, unpivot_col_id]
         )

         return Block(
             unpivot_expr,
             column_labels=[*id_labels, *var_names, value_name],
-            index_columns=[index_id],
+            index_columns=index_cols,
         )

-    def transpose(self, *, original_row_index: Optional[pd.Index] = None) -> Block:
+    def transpose(
+        self,
+        *,
+        original_row_index: Optional[pd.Index] = None,
+        single_row_mode: bool = False,
+    ) -> Block:
         """Transpose the block. Will fail if dtypes aren't coercible to a common type or too many rows.
         Can provide the original_row_index directly if it is already known, otherwise a query is needed.
         """
@@ -1718,7 +1731,11 @@ def transpose(self, *, original_row_index: Optional[pd.Index] = None) -> Block:
                 block.column_labels, pd.Index(range(len(block.column_labels)))
             )
         )
-        numbered_block, offsets = numbered_block.promote_offsets()
+        # TODO: Determine if single row from expression tree (after aggregation without groupby)
+        if single_row_mode:
+            numbered_block, offsets = numbered_block.create_constant(0)
+        else:
+            numbered_block, offsets = numbered_block.promote_offsets()

         stacked_block = numbered_block.melt(
             id_vars=(offsets,),
@@ -1727,6 +1744,7 @@ def transpose(self, *, original_row_index: Optional[pd.Index] = None) -> Block:
                 "col_offset",
             ),
             value_vars=block.value_columns,
+            create_offsets_index=False,
         )
         col_labels = stacked_block.value_columns[-2 - original_col_index.nlevels : -2]
         col_offset = stacked_block.value_columns[-2]  # disambiguator we created earlier
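The new single_row_mode flag exists because transpose needs a per-row identifier for its melt-and-pivot round trip, and promote_offsets is no longer available in unordered mode. For a block known to hold exactly one row, such as the output of aggregate_all_and_stack, a constant column identifies the row just as well. A rough pandas-only illustration of the idea, not the real Block machinery:

import pandas as pd

one_row = pd.DataFrame({"a": [1], "b": [2], "c": [3]})  # e.g. an aggregation result
numbered = one_row.copy()
numbered["row_id"] = 0  # constant id; equivalent to offsets when there is one row
stacked = numbered.melt(id_vars=["row_id"], var_name="col", value_name="value")
transposed = stacked.pivot(index="col", columns="row_id", values="value")
print(transposed)  # the original columns a, b, c become the rows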

bigframes/core/compile/compiler.py

Lines changed: 5 additions & 0 deletions
@@ -109,6 +109,11 @@ def compile_cached_table(node: nodes.CachedTableNode, ordered: bool = True):
     )
     ibis_table = ibis.table(physical_schema, full_table_name)
     if ordered:
+        if node.ordering is None:
+            # If this happens, session malfunctioned while applying cached results.
+            raise ValueError(
+                "Cannot use unordered cached value. Result requires ordering information."
+            )
         return compiled.OrderedIR(
             ibis_table,
             columns=tuple(

bigframes/core/nodes.py

Lines changed: 4 additions & 1 deletion
@@ -411,6 +411,7 @@ def transform_children(
         return self


+# This node shouldn't be used in the "original" expression tree, only used as replacement for original during planning
 @dataclass(frozen=True)
 class CachedTableNode(BigFrameNode):
     # The original BFET subtree that was cached
@@ -422,7 +423,7 @@ class CachedTableNode(BigFrameNode):
     table_id: str = field()
     physical_schema: Tuple[bq.SchemaField, ...] = field()

-    ordering: orderings.ExpressionOrdering = field()
+    ordering: typing.Optional[orderings.ExpressionOrdering] = field()

     @property
     def session(self):
@@ -446,6 +447,8 @@ def variables_introduced(self) -> int:
     @property
     def hidden_columns(self) -> typing.Tuple[str, ...]:
         """Physical columns used to define ordering but not directly exposed as value columns."""
+        if self.ordering is None:
+            return ()
         return tuple(
             col
             for col in sorted(self.ordering.referenced_columns)
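Taken together, the compiler.py and nodes.py changes establish an invariant: a CachedTableNode may now carry ordering=None (written by a session that kept no order information), and an ordered compilation that encounters such a node must fail loudly rather than invent an order. A condensed sketch of that contract, using stand-in types rather than the real compiler classes:

from dataclasses import dataclass
from typing import Optional, Tuple

@dataclass(frozen=True)
class CachedTable:  # stand-in for nodes.CachedTableNode
    table_id: str
    ordering: Optional[Tuple[str, ...]] = None  # None when cached without order info

def compile_cached(node: CachedTable, ordered: bool = True):
    if ordered:
        if node.ordering is None:
            # Reaching here means an unordered cache entry was applied to an
            # ordered query, which the session is supposed to prevent.
            raise ValueError(
                "Cannot use unordered cached value. Result requires ordering information."
            )
        return ("ordered_ir", node.table_id, node.ordering)
    return ("unordered_ir", node.table_id)

print(compile_cached(CachedTable("tmp_abc", ("row_num",))))   # ordered read works
print(compile_cached(CachedTable("tmp_abc"), ordered=False))  # unordered read works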

bigframes/session/__init__.py

Lines changed: 12 additions & 2 deletions
@@ -294,6 +294,9 @@ def __init__(
         self._bytes_processed_sum = 0
         self._slot_millis_sum = 0
         self._execution_count = 0
+        # Whether this session treats objects as totally ordered.
+        # Will expose as feature later, only False for internal testing
+        self._strictly_ordered = True

     @property
     def bqclient(self):
@@ -1841,24 +1844,31 @@ def _cache_with_cluster_cols(
         """Executes the query and uses the resulting table to rewrite future executions."""
         # TODO: Use this for all executions? Problem is that caching materializes extra
         # ordering columns
+        # TODO: May want to support some partial ordering info even for non-strict ordering mode
+        keep_order_info = self._strictly_ordered
+
         compiled_value = self._compile_ordered(array_value)

         ibis_expr = compiled_value._to_ibis_expr(
-            ordering_mode="unordered", expose_hidden_cols=True
+            ordering_mode="unordered", expose_hidden_cols=keep_order_info
         )
         tmp_table = self._ibis_to_temp_table(
             ibis_expr, cluster_cols=cluster_cols, api_name="cached"
         )
         cached_replacement = array_value.as_cached(
             cache_table=self.bqclient.get_table(tmp_table),
-            ordering=compiled_value._ordering,
+            ordering=compiled_value._ordering if keep_order_info else None,
         ).node
         self._cached_executions[array_value.node] = cached_replacement

     def _cache_with_offsets(self, array_value: core.ArrayValue):
         """Executes the query and uses the resulting table to rewrite future executions."""
         # TODO: Use this for all executions? Problem is that caching materializes extra
         # ordering columns
+        if not self._strictly_ordered:
+            raise ValueError(
+                "Caching with offsets only supported in strictly ordered mode."
+            )
         compiled_value = self._compile_ordered(array_value)

         ibis_expr = compiled_value._to_ibis_expr(
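Caching works by node substitution: _cache_with_cluster_cols materializes the subtree to a temporary table, then records a replacement under the original node in _cached_executions, so later compilations of the same subtree read the table instead of recomputing. With keep_order_info=False, the hidden ordering columns are never written and the replacement records ordering=None. A minimal sketch of the memoization pattern, using a plain dict stand-in rather than the real planner:

# Hypothetical stand-in for the session's node-rewrite map.
cached_executions: dict = {}

def record_cache(original_node, cached_replacement):
    # After materializing to a temp table, remember the swap.
    cached_executions[original_node] = cached_replacement

def resolve(node):
    # During planning, a node with a recorded cache entry is replaced by its
    # CachedTableNode equivalent; anything else passes through unchanged.
    return cached_executions.get(node, node)

record_cache("filter(join(t1, t2))", "cached_table: tmp_abc123")
print(resolve("filter(join(t1, t2))"))  # -> cached_table: tmp_abc123
print(resolve("some_other_subtree"))    # -> some_other_subtree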

tests/system/conftest.py

Lines changed: 11 additions & 0 deletions
@@ -139,6 +139,17 @@ def session() -> Generator[bigframes.Session, None, None]:
     session.close()  # close generated session at cleanup time


+@pytest.fixture(scope="session")
+def unordered_session() -> Generator[bigframes.Session, None, None]:
+    context = bigframes.BigQueryOptions(
+        location="US",
+    )
+    session = bigframes.Session(context=context)
+    session._strictly_ordered = False
+    yield session
+    session.close()  # close generated session at cleanup time
+
+
 @pytest.fixture(scope="session")
 def session_tokyo(tokyo_location: str) -> Generator[bigframes.Session, None, None]:
     context = bigframes.BigQueryOptions(
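A hypothetical sketch of how a test consumes the fixture (this snippet is illustrative only; the real test added by this commit appears in tests/system/small/test_unordered.py below):

import pandas as pd

import bigframes.pandas as bpd

def test_unordered_session_smoke(unordered_session):
    # The fixture flips the internal flag before any queries run.
    assert unordered_session._strictly_ordered is False
    df = bpd.DataFrame(pd.DataFrame({"a": [1, 2, 3]}), session=unordered_session)
    result = df.to_pandas(ordered=False)  # unordered fetch, as in the new test file
    assert set(result["a"]) == {1, 2, 3}  # content check that ignores row order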

tests/system/small/test_dataframe.py

Lines changed: 4 additions & 2 deletions
@@ -3124,9 +3124,9 @@ def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col,

     # Check dtype separately
     assert bf_result.dtype == "Int64"
-
+    # Is otherwise "object" dtype
+    pd_result.index = pd_result.index.astype("string[pyarrow]")
     # Pandas may produce narrower numeric types
-    # Pandas has object index type
     assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)


@@ -3146,6 +3146,7 @@ def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col

     # Pandas may produce narrower numeric types
     # Pandas has object index type
+    pd_result.index = pd_result.index.astype("string[pyarrow]")
     assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)


@@ -3183,6 +3184,7 @@ def test_dataframe_aggregates(

     # Pandas may produce narrower numeric types, but bigframes always produces Float64
     # Pandas has object index type
+    pd_result.index = pd_result.index.astype("string[pyarrow]")
     assert_series_equal(
         pd_result,
         bf_result,
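The added astype calls exist because a pandas column-wise aggregation returns a Series whose index (the column labels) has plain "object" dtype, while BigQuery DataFrames returns labels as pyarrow-backed strings; coercing the pandas side keeps the comparison apples-to-apples. A quick standalone illustration (assumes pyarrow is installed):

import pandas as pd

pd_result = pd.DataFrame({"a": [1, 2], "b": [3, 4]}).sum()
print(pd_result.index.dtype)  # object: plain Python strings
pd_result.index = pd_result.index.astype("string[pyarrow]")
print(pd_result.index.dtype)  # string[pyarrow]: matches the BigFrames label dtype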

tests/system/small/test_unordered.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pandas as pd
+
+import bigframes.pandas as bpd
+from tests.system.utils import assert_pandas_df_equal
+
+
+def test_unordered_mode_cache_aggregate(unordered_session):
+    pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype())
+    df = bpd.DataFrame(pd_df, session=unordered_session)
+    mean_diff = df - df.mean()
+    mean_diff.cache()
+    bf_result = mean_diff.to_pandas(ordered=False)
+    pd_result = pd_df - pd_df.mean()
+
+    assert_pandas_df_equal(bf_result, pd_result, ignore_order=True)
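This test exercises the changed paths end to end: df - df.mean() needs an aggregation with no ordering, .cache() materializes the result (presumably via _cache_with_cluster_cols, where keep_order_info=False drops the ordering metadata), and to_pandas(ordered=False) compiles an unordered read of the cached table.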
