
Commit 2e42e72

SNOW-2042703 - Upper bound for row estimates when resulting from a cartesian product (#3578)
There was a significant performance regression in astronomer_attribution resulting from the row estimation. After some investigation, the issue seems to occur after we estimate rows from the cartesian product in an align (and possibly a join, too). When the result of this estimation is very large, set_frame_2d_labels seems to take a very long time as well.

We had added a "hack" to _get_rows in SnowflakeQueryCompiler to catch this case and recalculate rows manually when we absolutely need them, but we apparently use this information in other places as well. Returning None when this product is very large (>1e34) reduces astronomer_attribution from 940s to 9s on my machine, and returning None from the row estimator is a conservative and reasonable thing to do (we should never fully count on estimates).

We still need to figure out why this value causes such a problem, and potentially fix that code as well; even with a very high maximum estimated upper bound on rows, we should not cause a performance regression. This change should provide some relief for those workloads in the meantime.

Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
1 parent 6da287e commit 2e42e72
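
The shape of the fix, as a minimal standalone sketch (illustrative Python only, not the library code; the real change and the 1e15 constant live in row_count_estimation.py below):

from typing import Optional

# Cap above which a cartesian-product row estimate is considered unusable.
MAX_ROW_COUNT_FOR_ESTIMATION = 1e15

def capped_product(left_bound: Optional[int], right_bound: Optional[int]) -> Optional[int]:
    """Upper-bound a cartesian product, or return None when unknown or too large.

    Callers must always be prepared for None: an estimate is a hint, not a fact.
    """
    if left_bound is None or right_bound is None:
        return None  # no information to estimate from
    product = left_bound * right_bound
    if product > MAX_ROW_COUNT_FOR_ESTIMATION:
        return None  # conservatively refuse to report an implausibly large bound
    return product

assert capped_product(1_000, 1_000) == 1_000_000
assert capped_product(10**10, 10**10) is None  # 1e20 exceeds the cap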

5 files changed: +59 −5 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -48,6 +48,9 @@
 
 #### Bug Fixes
 
+- Added an upper bound to the row estimation when the cartesian product from an align or join results in a very large number. This mitigates a performance regression.
+
+
 ## 1.34.0 (2025-07-15)
 
 ### Snowpark Python API Updates

src/snowflake/snowpark/modin/plugin/_internal/row_count_estimation.py

Lines changed: 20 additions & 2 deletions
@@ -30,6 +30,10 @@ class DataFrameOperation(Enum):
     SAMPLE = "sample"
 
 
+# The maximum number of rows we allow for estimates of joins/aligns
+MAX_ROW_COUNT_FOR_ESTIMATION = 1e15
+
+
 class RowCountEstimator:
     @staticmethod
     def upper_bound(
@@ -89,7 +93,14 @@ def upper_bound(
             if right_bound is None:
                 # Cannot estimate row count: other DataFrame has no row count information
                 return None
-            return current * right_bound
+            # SNOW-2042703 - TODO: Performance regression in cartesian products with row estimate
+            # When the product becomes very large we return None conservatively, as this can have
+            # a negative performance impact on alignment. This is a similar fix to what was added
+            # in SnowflakeQueryCompiler::_get_rows
+            cartesian_result = current * right_bound
+            if cartesian_result > MAX_ROW_COUNT_FOR_ESTIMATION:
+                return None
+            return cartesian_result
 
         # TODO: Implement a better estimate by having cases for different align types
         # Align can cause a Cartesian product with the row counts multiplying
@@ -99,7 +110,14 @@ def upper_bound(
             if other_bound is None:
                 # Cannot estimate row count: other DataFrame has no row count information
                 return None
-            return current * other_bound
+            # SNOW-2042703 - TODO: Performance regression in cartesian products with row estimate
+            # When the product becomes very large we return None conservatively, as this can have
+            # a negative performance impact on alignment. This is a similar fix to what was added
+            # in SnowflakeQueryCompiler::_get_rows
+            cartesian_result = current * other_bound
+            if cartesian_result > MAX_ROW_COUNT_FOR_ESTIMATION:
+                return None
+            return cartesian_result
 
         # Limit sets the upper bound to n rows
         elif operation == DataFrameOperation.LIMIT:
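
For a sense of where the 1e15 cap bites (hypothetical numbers, not taken from the commit): two inputs bounded at a hundred million rows each already overflow it, while two ten-million-row inputs still yield a usable bound.

MAX_ROW_COUNT_FOR_ESTIMATION = 1e15

assert 1e8 * 1e8 > MAX_ROW_COUNT_FOR_ESTIMATION   # 1e16: estimator now returns None
assert 1e7 * 1e7 <= MAX_ROW_COUNT_FOR_ESTIMATION  # 1e14: estimate is kept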

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 8 additions & 2 deletions
@@ -310,6 +310,9 @@
     validate_resample_supported_by_snowflake,
     compute_resample_start_and_end_date,
 )
+from snowflake.snowpark.modin.plugin._internal.row_count_estimation import (
+    MAX_ROW_COUNT_FOR_ESTIMATION,
+)
 from snowflake.snowpark.modin.plugin._internal.snowpark_pandas_types import (
     SnowparkPandasColumn,
     SnowparkPandasType,
@@ -772,10 +775,13 @@ def _get_rows(cls, query_compiler: BaseQueryCompiler) -> int:
         internal_frame = query_compiler._modin_frame
         ordered_dataframe = internal_frame.ordered_dataframe
         num_rows = ordered_dataframe.row_count_upper_bound
+        # SNOW-2042703 - TODO: Performance regression in cartesian products with row estimate
+        # It's possible this bit of code is related to the performance regression
         # hack to work around large numbers when things are an estimate
         if (
             ordered_dataframe.row_count_upper_bound is None
-            or ordered_dataframe.row_count_upper_bound > 1e34
+            or ordered_dataframe.row_count_upper_bound
+            > MAX_ROW_COUNT_FOR_ESTIMATION
         ):
             num_rows = query_compiler.get_axis_len(0)
         if num_rows is None:
@@ -791,7 +797,7 @@ def _max_shape(self) -> tuple[int, int]:
         # hack to work around large numbers when things are an estimate
         if (
             ordered_dataframe.row_count_upper_bound is None
-            or ordered_dataframe.row_count_upper_bound > 1e34
+            or ordered_dataframe.row_count_upper_bound > MAX_ROW_COUNT_FOR_ESTIMATION
         ):
             num_rows = self.get_axis_len(0)
         if num_rows is None:
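
The pattern in _get_rows and _max_shape above, as a standalone sketch (get_exact_row_count is a placeholder standing in for query_compiler.get_axis_len(0), not a real API):

from typing import Callable, Optional

MAX_ROW_COUNT_FOR_ESTIMATION = 1e15

def resolve_row_count(
    upper_bound: Optional[int], get_exact_row_count: Callable[[], int]
) -> int:
    """Use the cheap estimate when it is trustworthy; otherwise count for real."""
    if upper_bound is None or upper_bound > MAX_ROW_COUNT_FOR_ESTIMATION:
        # The estimate is missing or implausibly large (e.g. a cartesian
        # blow-up), so pay for an exact count rather than act on a bad hint.
        return get_exact_row_count()
    return upper_bound

assert resolve_row_count(10**20, lambda: 42) == 42    # inflated estimate -> exact count
assert resolve_row_count(1_000, lambda: 42) == 1_000  # plausible estimate -> used as-is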

tests/integ/modin/hybrid/test_switch_operations.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def test_merge(init_transaction_tables, us_holidays_data):
     assert combined.get_backend() == "Snowflake"
 
 
-@sql_count_checker(query_count=6)
+@sql_count_checker(query_count=7)
 def test_filtered_data(init_transaction_tables):
     # When data is filtered, the engine should change when it is sufficiently small.
     df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

tests/unit/modin/test_ordered_dataframe.py

Lines changed: 27 additions & 0 deletions
@@ -9,9 +9,13 @@
 from snowflake.snowpark.dataframe import DataFrame as SnowparkDataFrame
 from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import (
     DataFrameReference,
+    DataFrameOperation,
     OrderedDataFrame,
     OrderingColumn,
 )
+from snowflake.snowpark.modin.plugin._internal.row_count_estimation import (
+    RowCountEstimator,
+)
 from snowflake.snowpark.types import (
     ColumnIdentifier,
     IntegerType,
@@ -137,3 +141,26 @@ def test_ordered_dataframe_missing_row_position_column_negative(
         ordering_columns=[OrderingColumn('"INDEX"')],
         row_position_snowflake_quoted_identifier='"E"',
     )
+
+
+def test_row_count_estimator_join_big():
+    # Create two mock OrderedDataFrame objects with large row counts
+    df1 = mock.create_autospec(OrderedDataFrame)
+    df1.row_count = 1e10
+    df1.row_count_upper_bound = None
+
+    df2 = mock.create_autospec(OrderedDataFrame)
+    df2.row_count = 1e10
+    df2.row_count_upper_bound = None
+
+    # Verify that the RowCountEstimator returns None for a JOIN operation
+    # which is "large"
+    assert (
+        RowCountEstimator.upper_bound(df1, DataFrameOperation.JOIN, {"right": df2})
+        is None
+    )
+
+    assert (
+        RowCountEstimator.upper_bound(df1, DataFrameOperation.ALIGN, {"right": df2})
+        is None
+    )