
Commit 2e42e72

SNOW-2042703 - Upper bound for row estimates when resulting from a cartesian product (#3578)
There was a significant performance regression in astronomer_attribution resulting from the row estimation. After some investigation, the issue seems to occur after we estimate rows from the cartesian product in an align (and possibly a join, too). When the result of this estimation is very large, set_frame_2d_labels seems to take a very long time as well.

We had added a "hack" to _get_rows in SnowflakeQueryCompiler to catch this case and recalculate rows manually when we absolutely need them, but we apparently use this information in other places as well. Returning None when this product is very large (>1e34) reduces astronomer_attribution from 940s to 9s on my machine, and returning None from the row estimator is a conservative and reasonable thing to do (we should never fully count on estimates).

We still need to figure out why this value causes such a problem, and potentially fix that code as well; even with a very high maximum estimated upper bound on rows, we should not cause a performance regression. This change should provide some relief for those workloads in the meantime.

Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com>
1 parent 6da287e commit 2e42e72
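
The shape of the fix, as a minimal standalone sketch (illustrative Python only, not the library code; the real change and the 1e15 constant live in row_count_estimation.py below):

from typing import Optional

# Cap above which a cartesian-product row estimate is considered unusable.
MAX_ROW_COUNT_FOR_ESTIMATION = 1e15

def capped_product(left_bound: Optional[int], right_bound: Optional[int]) -> Optional[int]:
    """Upper-bound a cartesian product, or return None when unknown or too large.

    Callers must always be prepared for None: an estimate is a hint, not a fact.
    """
    if left_bound is None or right_bound is None:
        return None  # no information to estimate from
    product = left_bound * right_bound
    if product > MAX_ROW_COUNT_FOR_ESTIMATION:
        return None  # conservatively refuse to report an implausibly large bound
    return product

assert capped_product(1_000, 1_000) == 1_000_000
assert capped_product(10**10, 10**10) is None  # 1e20 exceeds the cap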

5 files changed: +59 −5 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -48,6 +48,9 @@
 
 #### Bug Fixes
 
+- Added an upper bound to the row estimation when the cartesian product from an align or join results in a very large number. This mitigates a performance regression.
+
+
 ## 1.34.0 (2025-07-15)
 
 ### Snowpark Python API Updates

src/snowflake/snowpark/modin/plugin/_internal/row_count_estimation.py

Lines changed: 20 additions & 2 deletions
@@ -30,6 +30,10 @@ class DataFrameOperation(Enum):
     SAMPLE = "sample"
 
 
+# The maximum number of rows we allow for estimates of joins/aligns
+MAX_ROW_COUNT_FOR_ESTIMATION = 1e15
+
+
 class RowCountEstimator:
     @staticmethod
     def upper_bound(
@@ -89,7 +93,14 @@ def upper_bound(
             if right_bound is None:
                 # Cannot estimate row count: other DataFrame has no row count information
                 return None
-            return current * right_bound
+            # SNOW-2042703 - TODO: Performance regression in cartesian products with row estimate
+            # When the product becomes very large we return None conservatively, as this can have
+            # a negative performance impact on alignment. This is a similar fix to what was added
+            # in SnowflakeQueryCompiler::_get_rows
+            cartesian_result = current * right_bound
+            if cartesian_result > MAX_ROW_COUNT_FOR_ESTIMATION:
+                return None
+            return cartesian_result
 
         # TODO: Implement a better estimate by having cases for different align types
         # Align can cause a Cartesian product with the row counts multiplying
@@ -99,7 +110,14 @@ def upper_bound(
             if other_bound is None:
                 # Cannot estimate row count: other DataFrame has no row count information
                 return None
-            return current * other_bound
+            # SNOW-2042703 - TODO: Performance regression in cartesian products with row estimate
+            # When the product becomes very large we return None conservatively, as this can have
+            # a negative performance impact on alignment. This is a similar fix to what was added
+            # in SnowflakeQueryCompiler::_get_rows
+            cartesian_result = current * other_bound
+            if cartesian_result > MAX_ROW_COUNT_FOR_ESTIMATION:
+                return None
+            return cartesian_result
 
         # Limit sets the upper bound to n rows
         elif operation == DataFrameOperation.LIMIT:
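
For a sense of where the 1e15 cap bites (hypothetical numbers, not taken from the commit): two inputs bounded at a hundred million rows each already overflow it, while two ten-million-row inputs still yield a usable bound.

MAX_ROW_COUNT_FOR_ESTIMATION = 1e15

assert 1e8 * 1e8 > MAX_ROW_COUNT_FOR_ESTIMATION   # 1e16: estimator now returns None
assert 1e7 * 1e7 <= MAX_ROW_COUNT_FOR_ESTIMATION  # 1e14: estimate is kept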

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 8 additions & 2 deletions
@@ -310,6 +310,9 @@
     validate_resample_supported_by_snowflake,
     compute_resample_start_and_end_date,
 )
+from snowflake.snowpark.modin.plugin._internal.row_count_estimation import (
+    MAX_ROW_COUNT_FOR_ESTIMATION,
+)
 from snowflake.snowpark.modin.plugin._internal.snowpark_pandas_types import (
     SnowparkPandasColumn,
     SnowparkPandasType,
@@ -772,10 +775,13 @@ def _get_rows(cls, query_compiler: BaseQueryCompiler) -> int:
         internal_frame = query_compiler._modin_frame
         ordered_dataframe = internal_frame.ordered_dataframe
         num_rows = ordered_dataframe.row_count_upper_bound
+        # SNOW-2042703 - TODO: Performance regression in cartesian products with row estimate
+        # It's possible this bit of code is related to the performance regression
         # hack to work around large numbers when things are an estimate
         if (
             ordered_dataframe.row_count_upper_bound is None
-            or ordered_dataframe.row_count_upper_bound > 1e34
+            or ordered_dataframe.row_count_upper_bound
+            > MAX_ROW_COUNT_FOR_ESTIMATION
         ):
             num_rows = query_compiler.get_axis_len(0)
         if num_rows is None:
@@ -791,7 +797,7 @@ def _max_shape(self) -> tuple[int, int]:
         # hack to work around large numbers when things are an estimate
         if (
             ordered_dataframe.row_count_upper_bound is None
-            or ordered_dataframe.row_count_upper_bound > 1e34
+            or ordered_dataframe.row_count_upper_bound > MAX_ROW_COUNT_FOR_ESTIMATION
         ):
             num_rows = self.get_axis_len(0)
         if num_rows is None:
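
The pattern in _get_rows and _max_shape above, as a standalone sketch (get_exact_row_count is a placeholder standing in for query_compiler.get_axis_len(0), not a real API):

from typing import Callable, Optional

MAX_ROW_COUNT_FOR_ESTIMATION = 1e15

def resolve_row_count(
    upper_bound: Optional[int], get_exact_row_count: Callable[[], int]
) -> int:
    """Use the cheap estimate when it is trustworthy; otherwise count for real."""
    if upper_bound is None or upper_bound > MAX_ROW_COUNT_FOR_ESTIMATION:
        # The estimate is missing or implausibly large (e.g. a cartesian
        # blow-up), so pay for an exact count rather than act on a bad hint.
        return get_exact_row_count()
    return upper_bound

assert resolve_row_count(10**20, lambda: 42) == 42    # inflated estimate -> exact count
assert resolve_row_count(1_000, lambda: 42) == 1_000  # plausible estimate -> used as-is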

tests/integ/modin/hybrid/test_switch_operations.py

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ def test_merge(init_transaction_tables, us_holidays_data):
     assert combined.get_backend() == "Snowflake"
 
 
-@sql_count_checker(query_count=6)
+@sql_count_checker(query_count=7)
 def test_filtered_data(init_transaction_tables):
     # When data is filtered, the engine should change when it is sufficiently small.
     df_transactions = pd.read_snowflake("REVENUE_TRANSACTIONS")

tests/unit/modin/test_ordered_dataframe.py

Lines changed: 27 additions & 0 deletions
@@ -9,9 +9,13 @@
 from snowflake.snowpark.dataframe import DataFrame as SnowparkDataFrame
 from snowflake.snowpark.modin.plugin._internal.ordered_dataframe import (
     DataFrameReference,
+    DataFrameOperation,
     OrderedDataFrame,
     OrderingColumn,
 )
+from snowflake.snowpark.modin.plugin._internal.row_count_estimation import (
+    RowCountEstimator,
+)
 from snowflake.snowpark.types import (
     ColumnIdentifier,
     IntegerType,
@@ -137,3 +141,26 @@ def test_ordered_dataframe_missing_row_position_column_negative(
         ordering_columns=[OrderingColumn('"INDEX"')],
         row_position_snowflake_quoted_identifier='"E"',
     )
+
+
+def test_row_count_estimator_join_big():
+    # Create two mock OrderedDataFrame objects with large row counts
+    df1 = mock.create_autospec(OrderedDataFrame)
+    df1.row_count = 1e10
+    df1.row_count_upper_bound = None
+
+    df2 = mock.create_autospec(OrderedDataFrame)
+    df2.row_count = 1e10
+    df2.row_count_upper_bound = None
+
+    # Verify that the RowCountEstimator returns None for a JOIN operation
+    # which is "large"
+    assert (
+        RowCountEstimator.upper_bound(df1, DataFrameOperation.JOIN, {"right": df2})
+        is None
+    )
+
+    assert (
+        RowCountEstimator.upper_bound(df1, DataFrameOperation.ALIGN, {"right": df2})
+        is None
+    )