Commit 3346494

refactor: Unordered mode supports user partial orders (#842)
1 parent 827007c commit 3346494

File tree

10 files changed: +111 -84 lines changed

bigframes/core/__init__.py
bigframes/core/blocks.py
bigframes/core/compile/__init__.py
bigframes/core/compile/api.py
bigframes/core/compile/compiler.py
bigframes/core/indexes/base.py
bigframes/dataframe.py
bigframes/series.py
bigframes/session/__init__.py
tests/system/small/test_unordered.py


bigframes/core/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ def _compiled_schema(self) -> schemata.ArraySchema:
     def as_cached(
         self: ArrayValue,
         cache_table: google.cloud.bigquery.Table,
-        ordering: Optional[orderings.TotalOrdering],
+        ordering: Optional[orderings.RowOrdering],
     ) -> ArrayValue:
         """
         Replace the node with an equivalent one that references a table where the value has been materialized to.

bigframes/core/blocks.py

Lines changed: 29 additions & 11 deletions
@@ -498,9 +498,33 @@ def to_pandas(
         sampling_method: Optional[str] = None,
         random_state: Optional[int] = None,
         *,
-        ordered: Optional[bool] = None,
+        ordered: bool = True,
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
-        """Run query and download results as a pandas DataFrame."""
+        """Run query and download results as a pandas DataFrame.
+
+        Args:
+            max_download_size (int, default None):
+                Download size threshold in MB. If max_download_size is exceeded when downloading data
+                (e.g., to_pandas()), the data will be downsampled if
+                bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
+                raised. If set to a value other than None, this will supersede the global config.
+            sampling_method (str, default None):
+                Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
+                returns a portion of the data from the beginning. It is fast and requires minimal
+                computations to perform the downsampling; "uniform": This algorithm returns uniform
+                random samples of the data. If set to a value other than None, this will supersede
+                the global config.
+            random_state (int, default None):
+                The seed for the uniform downsampling algorithm. If provided, the uniform method may
+                take longer to execute and require more computation. If set to a value other than
+                None, this will supersede the global config.
+            ordered (bool, default True):
+                Determines whether the resulting pandas dataframe will be ordered.
+                Whether the row ordering is deterministic depends on whether session ordering is strict.
+
+        Returns:
+            pandas.DataFrame, QueryJob
+        """
         if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS):
             raise NotImplementedError(
                 f"The downsampling method {sampling_method} is not implemented, "
@@ -517,10 +541,7 @@ def to_pandas(

         df, query_job = self._materialize_local(
             materialize_options=MaterializationOptions(
-                downsampling=sampling,
-                ordered=ordered
-                if ordered is not None
-                else self.session._strictly_ordered,
+                downsampling=sampling, ordered=ordered
             )
         )
         df.set_axis(self.column_labels, axis=1, copy=False)
@@ -547,7 +568,7 @@ def to_pandas_batches(
         dtypes = dict(zip(self.index_columns, self.index.dtypes))
         dtypes.update(zip(self.value_columns, self.dtypes))
         _, query_job = self.session._query_to_destination(
-            self.session._to_sql(self.expr, ordered=self.session._strictly_ordered),
+            self.session._to_sql(self.expr, ordered=True),
             list(self.index_columns),
             api_name="cached",
             do_clustering=False,
@@ -2593,10 +2614,7 @@ def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index:
         index_columns = list(self._block.index_columns)
         expr = self._expr.select_columns(index_columns)
         results, _ = self.session._execute(
-            expr,
-            ordered=ordered
-            if (ordered is not None)
-            else self.session._strictly_ordered,
+            expr, ordered=ordered if ordered is not None else True
         )
         df = expr.session._rows_to_dataframe(results)
         df = df.set_index(index_columns)
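
The sampling knobs documented in this hunk surface with the same names on the public DataFrame.to_pandas. A minimal sketch of how they combine, assuming the public BigQuery penguins sample table and an already-configured session (the table choice and threshold values are illustrative only):

import bigframes
import bigframes.pandas as bpd

# Allow automatic downsampling when a download would exceed the size threshold.
bigframes.options.sampling.enable_downsampling = True

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# Cap the download at 10 MB; if exceeded, take a uniform random sample,
# seeded with random_state so repeated runs return the same rows.
pdf = df.to_pandas(max_download_size=10, sampling_method="uniform", random_state=42)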

bigframes/core/compile/__init__.py

Lines changed: 2 additions & 8 deletions
@@ -14,19 +14,13 @@
 from __future__ import annotations

 from bigframes.core.compile.api import (
-    compile_ordered,
-    compile_peek,
-    compile_raw,
-    compile_unordered,
+    SQLCompiler,
     test_only_ibis_inferred_schema,
     test_only_try_evaluate,
 )

 __all__ = [
-    "compile_peek",
-    "compile_unordered",
-    "compile_ordered",
-    "compile_raw",
+    "SQLCompiler",
     "test_only_try_evaluate",
     "test_only_ibis_inferred_schema",
 ]

bigframes/core/compile/api.py

Lines changed: 38 additions & 32 deletions
@@ -25,38 +25,44 @@
 _STRICT_COMPILER = compiler.Compiler(strict=True)


-def compile_peek(node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str:
-    """Compile node into sql that selects N arbitrary rows, may not execute deterministically."""
-    return _STRICT_COMPILER.compile_unordered_ir(node).peek_sql(n_rows)
-
-
-def compile_unordered(
-    node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {}
-) -> str:
-    """Compile node into sql where rows are unsorted, and no ordering information is preserved."""
-    return _STRICT_COMPILER.compile_unordered_ir(node).to_sql(
-        col_id_overrides=col_id_overrides
-    )
-
-
-def compile_ordered(
-    node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {}
-) -> str:
-    """Compile node into sql where rows are sorted with ORDER BY."""
-    return _STRICT_COMPILER.compile_ordered_ir(node).to_sql(
-        col_id_overrides=col_id_overrides, ordered=True
-    )
-
-
-def compile_raw(
-    node: bigframes.core.nodes.BigFrameNode,
-) -> Tuple[str, bigframes.core.ordering.TotalOrdering]:
-    """Compile node into sql that exposes all columns, including hidden ordering-only columns."""
-    ir = _STRICT_COMPILER.compile_ordered_ir(node)
-    sql = ir.raw_sql()
-    ordering_info = ir._ordering
-    assert ir.has_total_order
-    return sql, ordering_info  # type: ignore
+class SQLCompiler:
+    def __init__(self, strict: bool = True):
+        self._compiler = compiler.Compiler(strict=strict)
+
+    def compile_peek(self, node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str:
+        """Compile node into sql that selects N arbitrary rows, may not execute deterministically."""
+        return self._compiler.compile_unordered_ir(node).peek_sql(n_rows)
+
+    def compile_unordered(
+        self,
+        node: bigframes.core.nodes.BigFrameNode,
+        *,
+        col_id_overrides: Mapping[str, str] = {},
+    ) -> str:
+        """Compile node into sql where rows are unsorted, and no ordering information is preserved."""
+        return self._compiler.compile_unordered_ir(node).to_sql(
+            col_id_overrides=col_id_overrides
+        )
+
+    def compile_ordered(
+        self,
+        node: bigframes.core.nodes.BigFrameNode,
+        *,
+        col_id_overrides: Mapping[str, str] = {},
+    ) -> str:
+        """Compile node into sql where rows are sorted with ORDER BY."""
+        return self._compiler.compile_ordered_ir(node).to_sql(
+            col_id_overrides=col_id_overrides, ordered=True
+        )
+
+    def compile_raw(
+        self,
+        node: bigframes.core.nodes.BigFrameNode,
+    ) -> Tuple[str, bigframes.core.ordering.RowOrdering]:
+        """Compile node into sql that exposes all columns, including hidden ordering-only columns."""
+        ir = self._compiler.compile_ordered_ir(node)
+        sql = ir.raw_sql()
+        return sql, ir._ordering


 def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode):
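
A rough usage sketch of the new wrapper (internal API; the node here is assumed to be an already-built BigFrameNode, for example the array_value.node that the session code below passes in):

from bigframes.core.compile import SQLCompiler

# strict=False mirrors an unordered session: a total ordering is no longer
# asserted, so compiled plans may carry only a partial, user-defined row ordering.
sql_compiler = SQLCompiler(strict=False)

unordered_sql = sql_compiler.compile_unordered(node)    # no ordering preserved
ordered_sql = sql_compiler.compile_ordered(node)        # emits ORDER BY
peek_sql = sql_compiler.compile_peek(node, n_rows=5)    # five arbitrary rows
raw_sql, row_ordering = sql_compiler.compile_raw(node)  # SQL plus RowOrdering metadata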

bigframes/core/compile/compiler.py

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ class Compiler:

     def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR:
         ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True))
-        assert ir.has_total_order
+        if self.strict:
+            assert ir.has_total_order
         return ir

     def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR:

bigframes/core/indexes/base.py

Lines changed: 1 addition & 3 deletions
@@ -492,9 +492,7 @@ def to_pandas(self) -> pandas.Index:
             pandas.Index:
                 A pandas Index with all of the labels from this Index.
         """
-        return self._block.index.to_pandas(
-            ordered=self._block.session._strictly_ordered
-        )
+        return self._block.index.to_pandas(ordered=True)

     def to_numpy(self, dtype=None, **kwargs) -> np.ndarray:
         return self.to_pandas().to_numpy(dtype, **kwargs)

bigframes/dataframe.py

Lines changed: 10 additions & 14 deletions
@@ -1192,15 +1192,14 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame:
     def to_arrow(
         self,
         *,
-        ordered: Optional[bool] = None,
+        ordered: bool = True,
     ) -> pyarrow.Table:
         """Write DataFrame to an Arrow table / record batch.

         Args:
-            ordered (bool, default None):
-                Determines whether the resulting Arrow table will be deterministically ordered.
-                In some cases, unordered may result in a faster-executing query. If set to a value
-                other than None, will override Session default.
+            ordered (bool, default True):
+                Determines whether the resulting Arrow table will be ordered.
+                In some cases, unordered may result in a faster-executing query.

         Returns:
             pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame.
@@ -1211,9 +1210,7 @@ def to_arrow(
         )

         self._optimize_query_complexity()
-        pa_table, query_job = self._block.to_arrow(
-            ordered=ordered if ordered is not None else self._session._strictly_ordered,
-        )
+        pa_table, query_job = self._block.to_arrow(ordered=ordered)
         self._set_internal_query_job(query_job)
         return pa_table

@@ -1223,7 +1220,7 @@ def to_pandas(
         sampling_method: Optional[str] = None,
         random_state: Optional[int] = None,
         *,
-        ordered: Optional[bool] = None,
+        ordered: bool = True,
     ) -> pandas.DataFrame:
         """Write DataFrame to pandas DataFrame.

@@ -1243,10 +1240,9 @@ def to_pandas(
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
                 None, this will supersede the global config.
-            ordered (bool, default None):
-                Determines whether the resulting pandas dataframe will be deterministically ordered.
-                In some cases, unordered may result in a faster-executing query. If set to a value
-                other than None, will override Session default.
+            ordered (bool, default True):
+                Determines whether the resulting pandas dataframe will be ordered.
+                In some cases, unordered may result in a faster-executing query.

         Returns:
             pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the
@@ -1259,7 +1255,7 @@ def to_pandas(
             max_download_size=max_download_size,
             sampling_method=sampling_method,
             random_state=random_state,
-            ordered=ordered if ordered is not None else self._session._strictly_ordered,
+            ordered=ordered,
         )
         self._set_internal_query_job(query_job)
         return df.set_axis(self._block.column_labels, axis=1, copy=False)
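
The user-visible effect is that ordered now defaults to True, and passing False simply requests an unordered download rather than deferring to a session default. A short sketch, reusing the df from the earlier to_pandas example:

pdf_ordered = df.to_pandas()                 # ordered result (the default)
pdf_unordered = df.to_pandas(ordered=False)  # may run faster; row order not guaranteed
arrow_table = df.to_arrow(ordered=False)     # the same flag on the Arrow path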

bigframes/series.py

Lines changed: 5 additions & 6 deletions
@@ -329,7 +329,7 @@ def to_pandas(
         sampling_method: Optional[str] = None,
         random_state: Optional[int] = None,
         *,
-        ordered: Optional[bool] = None,
+        ordered: bool = True,
     ) -> pandas.Series:
         """Writes Series to pandas Series.

@@ -349,10 +349,9 @@ def to_pandas(
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
                 None, this will supersede the global config.
-            ordered (bool, default None):
-                Determines whether the resulting pandas series will be deterministically ordered.
-                In some cases, unordered may result in a faster-executing query. If set to a value
-                other than None, will override Session default.
+            ordered (bool, default True):
+                Determines whether the resulting pandas series will be ordered.
+                In some cases, unordered may result in a faster-executing query.


         Returns:
@@ -364,7 +363,7 @@ def to_pandas(
             max_download_size=max_download_size,
             sampling_method=sampling_method,
             random_state=random_state,
-            ordered=ordered if ordered is not None else self._session._strictly_ordered,
+            ordered=ordered,
         )
         self._set_internal_query_job(query_job)
         series = df.squeeze(axis=1)
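
The Series path mirrors the DataFrame one. A minimal sketch, assuming the penguins df from the earlier example (body_mass_g is one of its columns):

s = df["body_mass_g"]
# Unordered download of a single column; internally the one-column
# DataFrame result is squeezed back into a pandas Series.
ps = s.to_pandas(ordered=False)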

bigframes/session/__init__.py

Lines changed: 9 additions & 8 deletions
@@ -304,6 +304,9 @@ def __init__(
             if context._strictly_ordered
             else bigframes.enums.DefaultIndexKind.NULL
         )
+        self._compiler = bigframes.core.compile.SQLCompiler(
+            strict=context._strictly_ordered
+        )

         self._remote_function_session = bigframes_rf._RemoteFunctionSession()

@@ -1893,18 +1896,16 @@ def _cache_with_cluster_cols(
         """Executes the query and uses the resulting table to rewrite future executions."""
         # TODO: Use this for all executions? Problem is that caching materializes extra
         # ordering columns
-        # TODO: May want to support some partial ordering info even for non-strict ordering mode
-        keep_order_info = self._strictly_ordered

-        sql, ordering_info = bigframes.core.compile.compile_raw(
+        sql, ordering_info = self._compiler.compile_raw(
             self._with_cached_executions(array_value.node)
         )
         tmp_table = self._sql_to_temp_table(
             sql, cluster_cols=cluster_cols, api_name="cached"
         )
         cached_replacement = array_value.as_cached(
             cache_table=self.bqclient.get_table(tmp_table),
-            ordering=ordering_info if keep_order_info else None,
+            ordering=ordering_info,
         ).node
         self._cached_executions[array_value.node] = cached_replacement

@@ -1917,7 +1918,7 @@ def _cache_with_offsets(self, array_value: core.ArrayValue):
                 "Caching with offsets only supported in strictly ordered mode."
             )
         offset_column = bigframes.core.guid.generate_guid("bigframes_offsets")
-        sql = bigframes.core.compile.compile_unordered(
+        sql = self._compiler.compile_unordered(
             self._with_cached_executions(
                 array_value.promote_offsets(offset_column).node
             )
@@ -2023,7 +2024,7 @@ def _peek(
         """A 'peek' efficiently accesses a small number of rows in the dataframe."""
         if not tree_properties.peekable(self._with_cached_executions(array_value.node)):
             warnings.warn("Peeking this value cannot be done efficiently.")
-        sql = bigframes.core.compile.compile_peek(
+        sql = self._compiler.compile_peek(
             self._with_cached_executions(array_value.node), n_rows
         )

@@ -2044,10 +2045,10 @@ def _to_sql(
             array_value = array_value.promote_offsets(offset_column)
         node_w_cached = self._with_cached_executions(array_value.node)
         if ordered:
-            return bigframes.core.compile.compile_ordered(
+            return self._compiler.compile_ordered(
                 node_w_cached, col_id_overrides=col_id_overrides
             )
-        return bigframes.core.compile.compile_unordered(
+        return self._compiler.compile_unordered(
             node_w_cached, col_id_overrides=col_id_overrides
         )
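
Taken together, the session now owns a single compiler whose strictness tracks the session's ordering mode, and caching always records whatever ordering metadata the compiler produced. A simplified conceptual sketch of that wiring (not the real Session class; _materialize is a hypothetical stand-in for _sql_to_temp_table plus get_table):

from bigframes.core.compile import SQLCompiler

class _SessionSketch:
    def __init__(self, strictly_ordered: bool):
        # One compiler per session; strict only when total ordering is guaranteed.
        self._compiler = SQLCompiler(strict=strictly_ordered)

    def cache(self, array_value):
        # compile_raw returns RowOrdering metadata even in non-strict mode,
        # so a user-applied sort_values() survives materialization.
        sql, ordering_info = self._compiler.compile_raw(array_value.node)
        table = self._materialize(sql)  # hypothetical helper for this sketch
        return array_value.as_cached(cache_table=table, ordering=ordering_info)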

tests/system/small/test_unordered.py

Lines changed: 14 additions & 0 deletions
@@ -132,3 +132,17 @@ def test_unordered_mode_blocks_windowing(unordered_session, function):
         match=r"Op.*not supported when strict ordering is disabled",
     ):
         function(df)
+
+
+def test_unordered_mode_cache_preserves_order(unordered_session):
+    pd_df = pd.DataFrame(
+        {"a": [1, 2, 3, 4, 5, 6], "b": [4, 5, 9, 3, 1, 6]}, dtype=pd.Int64Dtype()
+    )
+    pd_df.index = pd_df.index.astype(pd.Int64Dtype())
+    df = bpd.DataFrame(pd_df, session=unordered_session)
+    sorted_df = df.sort_values("b").cache()
+    bf_result = sorted_df.to_pandas()
+    pd_result = pd_df.sort_values("b")
+
+    # B is unique so unstrict order mode result here should be equivalent to strictly ordered
+    assert_pandas_df_equal(bf_result, pd_result, ignore_order=False)
