
Commit 522749f

SNOW-2230533 - hybrid adjust defaults (#3669)
Co-authored-by: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com>
1 parent c9d4a24 commit 522749f

4 files changed, +50 -10 lines changed


CHANGELOG.md

Lines changed: 22 additions & 0 deletions
@@ -1,5 +1,27 @@
 # Release History

+## 1.38.0 (YYYY-MM-DD)
+
+### Snowpark Python API Updates
+
+#### New Features
+
+#### Bug Fixes
+
+#### Deprecations
+
+#### Dependency Updates
+
+### Snowpark pandas API Updates
+
+#### New Features
+
+#### Improvements
+- Set the default transfer limit in hybrid execution for data leaving Snowflake to 100k rows, which can be overridden with the SnowflakePandasTransferThreshold environment variable. This configuration is appropriate for scenarios with two available engines, "Pandas" and "Snowflake", on relational workloads.
+
+#### Dependency Updates
+#### Bug Fixes
+
 ## 1.37.0 (YYYY-MM-DD)

 ### Snowpark Python API Updates
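As a rough sketch of how a user could override the new default, assuming the environment variable is read when the Snowpark pandas plugin initializes; the variable name SNOWFLAKE_PANDAS_MAX_XFER_ROWS comes from envvars.py in this commit, and 500000 is an arbitrary example value:

import os

# Hypothetical override of the new 100k-row default; set before the plugin
# is imported so the configuration is picked up (a conservative assumption).
os.environ["SNOWFLAKE_PANDAS_MAX_XFER_ROWS"] = "500000"

import modin.pandas as pd  # noqa: E402
import snowflake.snowpark.modin.plugin  # noqa: E402,F401  # registers the Snowflake backend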

docs/source/modin/hybrid_execution.rst

Lines changed: 2 additions & 2 deletions
@@ -120,8 +120,8 @@ Configuring Transfer Costs

 Transfer costs are also considered for data moving between engines. For data moving
 from Snowflake this threshold can be configured with the SnowflakePandasTransferThreshold
-environment variable. This is set to 10M rows by default; which will penalize
-the movement of data as it nears this threshold.
+environment variable. This is set to 100k rows by default, which will penalize
+the movement of data as it nears this threshold. The default may change in the future.

 .. code-block:: python

src/snowflake/snowpark/modin/config/envvars.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class SnowflakePandasTransferThreshold(EnvironmentVariable, type=int):
     """

     varname = "SNOWFLAKE_PANDAS_MAX_XFER_ROWS"
-    default = 10_000_000
+    default = 100_000


 # have to monkey patch this into modin right now to use config contexts
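For completeness, a minimal sketch of working with this config class directly; .get() appears in the test below, while .put() and the import path follow modin's EnvironmentVariable conventions and are assumptions here:

# Import path assumed; the class is defined in snowflake/snowpark/modin/config/envvars.py.
from snowflake.snowpark.modin.config import SnowflakePandasTransferThreshold

# New default after this change.
assert SnowflakePandasTransferThreshold.get() == 100_000

# Raise the transfer limit for the current session; put() is the standard
# modin EnvironmentVariable setter and is assumed to apply here as well.
SnowflakePandasTransferThreshold.put(1_000_000)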

tests/integ/modin/hybrid/test_switch_operations.py

Lines changed: 25 additions & 7 deletions
@@ -76,11 +76,11 @@ def test_snowflake_pandas_transfer_threshold():
     is correctly used in the cost model.
     """
     # Verify the default value of the configuration variable.
-    assert SnowflakePandasTransferThreshold.get() == 10_000_000
+    assert SnowflakePandasTransferThreshold.get() == 100_000

     # Create a SnowflakeQueryCompiler and verify that it has the default value.
     compiler = SnowflakeQueryCompiler(mock.create_autospec(InternalFrame))
-    assert compiler._transfer_threshold() == 10_000_000
+    assert compiler._transfer_threshold() == 100_000

     df = pd.DataFrame()
     assert df.get_backend() == "Pandas"
@@ -175,23 +175,41 @@ def test_filtered_data(init_transaction_tables):
     df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])
     assert df_transactions.get_backend() == "Snowflake"
     base_date = pd.Timestamp("2025-06-09").date()
+
+    # Filter 1 will stay in Snowflake because no operations are
+    # performed that would trigger a switch.
     df_transactions_filter1 = df_transactions[
         (df_transactions["DATE"] >= base_date - pd.Timedelta("7 days"))
         & (df_transactions["DATE"] < base_date)
-    ]
+    ][["DATE", "REVENUE"]]
     assert df_transactions_filter1.get_backend() == "Snowflake"
+
+    # We still do not know the size of the underlying data, so
+    # GroupBy.sum will keep the data in Snowflake.
     # The smaller dataframe does operations in pandas
-    df_transactions_filter1 = df_transactions_filter1.groupby("DATE").sum()["REVENUE"]
+    df_transactions_filter1 = df_transactions_filter1.groupby("DATE").sum()
     # We still operate in Snowflake because we cannot properly estimate the rows
    assert df_transactions_filter1.get_backend() == "Snowflake"
+
+    # Filter 2 will immediately move to pandas because we know the size of the
+    # result set. The SQL here is functionally the same as above.
     df_transactions_filter2 = pd.read_snowflake(
-        "SELECT * FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, '2025-06-09' ) and Date < '2025-06-09'"
+        "SELECT Date, SUM(Revenue) AS REVENUE FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, '2025-06-09' ) and Date < '2025-06-09' GROUP BY DATE"
     )
     assert df_transactions_filter2.get_backend() == "Pandas"
+
+    # Sort and compare the results.
     assert_array_equal(
         # Snowpark handles index objects differently from native pandas, so just check values
-        df_transactions_filter1.to_pandas().values,
-        df_transactions_filter2.groupby("DATE").sum()["REVENUE"].to_pandas().values,
+        # A .head on filter1 will trigger migration to pandas
+        df_transactions_filter1["REVENUE"]
+        .to_pandas()
+        .sort_values(ascending=True)
+        .values,
+        df_transactions_filter2["REVENUE"]
+        .to_pandas()
+        .sort_values(ascending=True)
+        .values,
     )

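In practice, the lower default means that small, known-size results are more likely to start on the local pandas backend while large tables stay in Snowflake. A rough illustration, assuming hybrid execution (automatic backend switching) is enabled and reusing the revenue_transactions table from the test above as a placeholder:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

# Placeholder queries; the table name is borrowed from the test fixture.
small = pd.read_snowflake(
    "SELECT Date, SUM(Revenue) AS REVENUE FROM revenue_transactions GROUP BY Date"
)
large = pd.read_snowflake("SELECT * FROM revenue_transactions")

# An aggregated result well under the 100k-row threshold can start on the
# local "Pandas" backend; the full table is expected to stay on "Snowflake".
print(small.get_backend(), large.get_backend())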
