
Commit 522749f

SNOW-2230533 - hybrid adjust defaults (#3669)
Co-authored-by: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com>
1 parent c9d4a24 commit 522749f

4 files changed, +50 -10 lines changed


CHANGELOG.md

Lines changed: 22 additions & 0 deletions
@@ -1,5 +1,27 @@
 # Release History

+## 1.38.0 (YYYY-MM-DD)
+
+### Snowpark Python API Updates
+
+#### New Features
+
+#### Bug Fixes
+
+#### Deprecations
+
+#### Dependency Updates
+
+### Snowpark pandas API Updates
+
+#### New Features
+
+#### Improvements
+- Set the default transfer limit in hybrid execution for data leaving Snowflake to 100k rows, which can be overridden with the SnowflakePandasTransferThreshold environment variable. This configuration is appropriate for scenarios with two available engines, "Pandas" and "Snowflake", on relational workloads.
+
+#### Dependency Updates
+#### Bug Fixes
+
 ## 1.37.0 (YYYY-MM-DD)

 ### Snowpark Python API Updates
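As a rough sketch of how a user could override the new default, assuming the environment variable is read when the Snowpark pandas plugin initializes; the variable name SNOWFLAKE_PANDAS_MAX_XFER_ROWS comes from envvars.py in this commit, and 500000 is an arbitrary example value:

import os

# Hypothetical override of the new 100k-row default; set before the plugin
# is imported so the configuration is picked up (a conservative assumption).
os.environ["SNOWFLAKE_PANDAS_MAX_XFER_ROWS"] = "500000"

import modin.pandas as pd  # noqa: E402
import snowflake.snowpark.modin.plugin  # noqa: E402,F401  # registers the Snowflake backend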

docs/source/modin/hybrid_execution.rst

Lines changed: 2 additions & 2 deletions
@@ -120,8 +120,8 @@ Configuring Transfer Costs

 Transfer costs are also considered for data moving between engines. For data moving
 from Snowflake this threshold can be configured with the SnowflakePandasTransferThreshold
-environment variable. This is set to 10M rows by default; which will penalize
-the movement of data as it nears this threshold.
+environment variable. This is set to 100k rows by default, which will penalize
+the movement of data as it nears this threshold. The default may change in the future.

 .. code-block:: python

src/snowflake/snowpark/modin/config/envvars.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ class SnowflakePandasTransferThreshold(EnvironmentVariable, type=int):
     """

     varname = "SNOWFLAKE_PANDAS_MAX_XFER_ROWS"
-    default = 10_000_000
+    default = 100_000


 # have to monkey patch this into modin right now to use config contexts
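For completeness, a minimal sketch of working with this config class directly; .get() appears in the test below, while .put() and the import path follow modin's EnvironmentVariable conventions and are assumptions here:

# Import path assumed; the class is defined in snowflake/snowpark/modin/config/envvars.py.
from snowflake.snowpark.modin.config import SnowflakePandasTransferThreshold

# New default after this change.
assert SnowflakePandasTransferThreshold.get() == 100_000

# Raise the transfer limit for the current session; put() is the standard
# modin EnvironmentVariable setter and is assumed to apply here as well.
SnowflakePandasTransferThreshold.put(1_000_000)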

tests/integ/modin/hybrid/test_switch_operations.py

Lines changed: 25 additions & 7 deletions
@@ -76,11 +76,11 @@ def test_snowflake_pandas_transfer_threshold():
     is correctly used in the cost model.
     """
     # Verify the default value of the configuration variable.
-    assert SnowflakePandasTransferThreshold.get() == 10_000_000
+    assert SnowflakePandasTransferThreshold.get() == 100_000

     # Create a SnowflakeQueryCompiler and verify that it has the default value.
     compiler = SnowflakeQueryCompiler(mock.create_autospec(InternalFrame))
-    assert compiler._transfer_threshold() == 10_000_000
+    assert compiler._transfer_threshold() == 100_000

     df = pd.DataFrame()
     assert df.get_backend() == "Pandas"
@@ -175,23 +175,41 @@ def test_filtered_data(init_transaction_tables):
     df_transactions["DATE"] = pd.to_datetime(df_transactions["DATE"])
     assert df_transactions.get_backend() == "Snowflake"
     base_date = pd.Timestamp("2025-06-09").date()
+
+    # Filter 1 will stay in Snowflake because no operations are
+    # performed that would trigger a switch.
     df_transactions_filter1 = df_transactions[
         (df_transactions["DATE"] >= base_date - pd.Timedelta("7 days"))
         & (df_transactions["DATE"] < base_date)
-    ]
+    ][["DATE", "REVENUE"]]
     assert df_transactions_filter1.get_backend() == "Snowflake"
+
+    # We still do not know the size of the underlying data, so
+    # GroupBy.sum will keep the data in Snowflake.
     # The smaller dataframe does operations in pandas
-    df_transactions_filter1 = df_transactions_filter1.groupby("DATE").sum()["REVENUE"]
+    df_transactions_filter1 = df_transactions_filter1.groupby("DATE").sum()
     # We still operate in Snowflake because we cannot properly estimate the rows
    assert df_transactions_filter1.get_backend() == "Snowflake"
+
+    # Filter 2 will immediately move to pandas because we know the size of the
+    # result set. The SQL here is functionally the same as above.
     df_transactions_filter2 = pd.read_snowflake(
-        "SELECT * FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, '2025-06-09' ) and Date < '2025-06-09'"
+        "SELECT Date, SUM(Revenue) AS REVENUE FROM revenue_transactions WHERE Date >= DATEADD( 'days', -7, '2025-06-09' ) and Date < '2025-06-09' GROUP BY DATE"
     )
     assert df_transactions_filter2.get_backend() == "Pandas"
+
+    # Sort and compare the results.
     assert_array_equal(
         # Snowpark handles index objects differently from native pandas, so just check values
-        df_transactions_filter1.to_pandas().values,
-        df_transactions_filter2.groupby("DATE").sum()["REVENUE"].to_pandas().values,
+        # A .head on filter1 will trigger migration to pandas
+        df_transactions_filter1["REVENUE"]
+        .to_pandas()
+        .sort_values(ascending=True)
+        .values,
+        df_transactions_filter2["REVENUE"]
+        .to_pandas()
+        .sort_values(ascending=True)
+        .values,
     )

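In practice, the lower default means that small, known-size results are more likely to start on the local pandas backend while large tables stay in Snowflake. A rough illustration, assuming hybrid execution (automatic backend switching) is enabled and reusing the revenue_transactions table from the test above as a placeholder:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

# Placeholder queries; the table name is borrowed from the test fixture.
small = pd.read_snowflake(
    "SELECT Date, SUM(Revenue) AS REVENUE FROM revenue_transactions GROUP BY Date"
)
large = pd.read_snowflake("SELECT * FROM revenue_transactions")

# An aggregated result well under the 100k-row threshold can start on the
# local "Pandas" backend; the full table is expected to stay on "Snowflake".
print(small.get_backend(), large.get_backend())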
