SNOW-2396665: Add support for isin in faster pandas (#3856)

sfc-gh-helmeleegy · web-flow · commit 82e510d6e928 · 2025-10-07T15:49:07.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -158,6 +158,7 @@
 - Improved performance of `DataFrame.to_snowflake` and `pd.to_snowflake(dataframe)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
 - Improved performance of `Series.to_snowflake` and `pd.to_snowflake(series)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.
 - Set `cte_optimization_enabled` to True for all Snowpark pandas sessions.
+- Added support for `isin` in faster pandas.
 
 ## 1.39.1 (2025-09-25)
 
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -13009,6 +13009,31 @@ def isin(
         values: Union[
             list[Any], np.ndarray, "SnowflakeQueryCompiler", dict[Hashable, ListLike]
         ],
+    ) -> "SnowflakeQueryCompiler":
+        """
+        Wrapper around _isin_internal to be supported in faster pandas.
+        """
+        relaxed_query_compiler = None
+        if self._relaxed_query_compiler is not None and (
+            not isinstance(values, SnowflakeQueryCompiler)
+            or values._relaxed_query_compiler is not None
+        ):
+            new_values = values
+            if isinstance(values, SnowflakeQueryCompiler):
+                assert values._relaxed_query_compiler is not None
+                new_values = values._relaxed_query_compiler
+            relaxed_query_compiler = self._relaxed_query_compiler._isin_internal(
+                values=new_values
+            )
+
+        qc = self._isin_internal(values=values)
+        return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
+
+    def _isin_internal(
+        self,
+        values: Union[
+            list[Any], np.ndarray, "SnowflakeQueryCompiler", dict[Hashable, ListLike]
+        ],
     ) -> "SnowflakeQueryCompiler":  # noqa: PR02
         """
         Check for each element of `self` whether it's contained in passed `values`.
diff --git a/tests/integ/modin/test_faster_pandas.py b/tests/integ/modin/test_faster_pandas.py
@@ -220,6 +220,64 @@ def test_isna_notna(session, func):
     assert_frame_equal(snow_result, native_result, check_dtype=False)
 
 
+@sql_count_checker(query_count=3)
+def test_isin_list(session):
+    # create a single temp table holding a small two-column (A, B) frame
+    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
+    session.create_dataframe(
+        native_pd.DataFrame([[1, 11], [2, 12], [3, 13]], columns=["A", "B"])
+    ).write.save_as_table(table_name, table_type="temp")
+
+    # read the table back and keep only the rows whose B value is in a literal list
+    df = pd.read_snowflake(table_name)
+    snow_result = df[df["B"].isin([12, 13])]
+
+    # verify that the input dataframe has a populated relaxed query compiler
+    assert df._query_compiler._relaxed_query_compiler is not None
+    assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    # verify that the isin-filtered result also carries a relaxed query compiler
+    assert snow_result._query_compiler._relaxed_query_compiler is not None
+    assert (
+        snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    )
+
+    # build the expected result by applying the same filter to the to_pandas() copy
+    native_df = df.to_pandas()
+    native_result = native_df[native_df["B"].isin([12, 13])]
+
+    # compare results (dtype differences are ignored via check_dtype=False)
+    assert_frame_equal(snow_result, native_result, check_dtype=False)
+
+
+@sql_count_checker(query_count=3)
+def test_isin_series(session):
+    # create a single temp table holding a small two-column (A, B) frame
+    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
+    session.create_dataframe(
+        native_pd.DataFrame([[1, 11], [2, 12], [3, 13]], columns=["A", "B"])
+    ).write.save_as_table(table_name, table_type="temp")
+
+    # read the table back and filter B with isin against column A of the same frame
+    df = pd.read_snowflake(table_name)
+    snow_result = df[df["B"].isin(df["A"])]
+
+    # verify that the input dataframe has a populated relaxed query compiler
+    assert df._query_compiler._relaxed_query_compiler is not None
+    assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    # verify that the isin-filtered result also carries a relaxed query compiler
+    assert snow_result._query_compiler._relaxed_query_compiler is not None
+    assert (
+        snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    )
+
+    # build the expected result by applying the same filter to the to_pandas() copy
+    native_df = df.to_pandas()
+    native_result = native_df[native_df["B"].isin(native_df["A"])]
+
+    # compare results (dtype differences are ignored via check_dtype=False)
+    assert_frame_equal(snow_result, native_result, check_dtype=False)
+
+
 @sql_count_checker(query_count=0)
 def test_dummy_row_pos_optimization_enabled_on_session(db_parameters):
     with Session.builder.configs(db_parameters).create() as new_session: